library(HDCRISPR2019)
library(ROCR)
library(clusterProfiler)
#> 
#> Registered S3 method overwritten by 'enrichplot':
#>   method               from
#>   fortify.enrichResult DOSE
#> clusterProfiler v3.14.3  For help: https://guangchuangyu.github.io/software/clusterProfiler
#> 
#> If you use clusterProfiler in published research, please cite:
#> Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
library(limma)
library(org.Hs.eg.db)
#> Loading required package: AnnotationDbi
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following object is masked from 'package:limma':
#> 
#>     plotMA
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
#>     as.data.frame, basename, cbind, colnames, dirname, do.call,
#>     duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#>     lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#>     pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
#>     tapply, union, unique, unsplit, which, which.max, which.min
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> Loading required package: IRanges
#> Loading required package: S4Vectors
#> Warning: package 'S4Vectors' was built under R version 3.6.3
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#> 
#>     expand.grid
#> 
library(preprocessCore)
library(mixtools)
#> mixtools package, version 1.2.0, Released 2020-02-05
#> This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
library(Biostrings)
#> Loading required package: XVector
#> 
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#> 
#>     strsplit
library(gscreend)
library(SummarizedExperiment)
#> Loading required package: GenomicRanges
#> Loading required package: GenomeInfoDb
#> Warning: package 'GenomeInfoDb' was built under R version 3.6.3
#> Loading required package: DelayedArray
#> Warning: package 'DelayedArray' was built under R version 3.6.3
#> Loading required package: matrixStats
#> 
#> Attaching package: 'matrixStats'
#> The following objects are masked from 'package:Biobase':
#> 
#>     anyMissing, rowMedians
#> Loading required package: BiocParallel
#> 
#> Attaching package: 'DelayedArray'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
#> The following object is masked from 'package:clusterProfiler':
#> 
#>     simplify
#> The following objects are masked from 'package:base':
#> 
#>     aperm, apply, rowsum
library(reshape2)
library(eulerr)
library(UpSetR)
library(gridExtra)
#> 
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:Biobase':
#> 
#>     combine
#> The following object is masked from 'package:BiocGenerics':
#> 
#>     combine
library(pheatmap)
library(ggrastr)
library(cowplot)
library(ggrepel)
#> Loading required package: ggplot2
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
#> ✓ tibble  3.0.3     ✓ dplyr   1.0.2
#> ✓ tidyr   1.1.2     ✓ stringr 1.4.0
#> ✓ readr   1.3.1     ✓ forcats 0.5.0
#> ✓ purrr   0.3.4
#> ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
#> x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
#> x dplyr::collapse()   masks Biostrings::collapse(), IRanges::collapse()
#> x dplyr::combine()    masks gridExtra::combine(), Biobase::combine(), BiocGenerics::combine()
#> x purrr::compact()    masks XVector::compact()
#> x dplyr::count()      masks matrixStats::count()
#> x dplyr::desc()       masks IRanges::desc()
#> x tidyr::expand()     masks S4Vectors::expand()
#> x dplyr::filter()     masks stats::filter()
#> x dplyr::first()      masks S4Vectors::first()
#> x dplyr::lag()        masks stats::lag()
#> x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
#> x dplyr::rename()     masks S4Vectors::rename()
#> x dplyr::select()     masks AnnotationDbi::select()
#> x purrr::simplify()   masks DelayedArray::simplify(), clusterProfiler::simplify()
#> x dplyr::slice()      masks XVector::slice(), IRanges::slice()

Design

Data

We require a number of datasets for downstream analyses that we first load into memory.

## core-essential and non-essential genes
data('cene', package='HDCRISPR2019')

Overview library composition

Determination of validated sgRNAs

BAGEL evalutation of dropout screens

We previously re-evaluated 439 dropout screen in GenomeCRISPR using BAGEL. We use the calculated Bayes Factors and corresponding log2 fold changes to determine hits at 5% false discovery rate (FDR). We then select the sgRNAs that contributed to each of the hits by showing a dropout phenotype. These sgRNAs will then be selected for the HD CRISPR library. If an sgRNA showed a phenotype in more than one screen, we prefer it over other sgRNAs with the same target gene.

BAGEL results hit calling

First we read the Bayes Factors as returned by BAGEL and use the ROCR package with the core-essential gene set 2 and the non-essential gene set (Hart 2017, G3) to determine hits at 5% FDR.

## load BAGEL results for GenomeCRISPR screens
data('bagel_results', package='HDCRISPR2019')

## generate ROC curves
roc_results <- lapply(bagel_results, function(screen){
  data <- screen %>% mutate(label=ifelse(GENE %in% ce$symbol, 1,
                                  ifelse(GENE %in% ne$symbol, 0, -1))) %>% 
    filter(label != -1) 
  pred <- prediction(data$BF, data$label)
  perf <- performance(pred, measure='prec', x.measure='rec')
  ## also calculate auc
  auc <- performance(pred, measure='auc')@y.values[[1]]
  cutoff <- perf@alpha.values[[1]][max(which(perf@y.values[[1]] > 0.95))]
  return(tibble(
    pubmed = data$pubmed[1],
    cellline = data$cellline[1],
    condition = data$condition[1],
    precision = perf@y.values[[1]],
    recall = perf@x.values[[1]],
    cutoff = cutoff,
    auc = auc
  ))
})

To use only high-quality screens for downstream processing, we check the quality of the screens by plotting the ROC curves. We evalute overall screen quality by the area under the precision-recall-curve. We retain screens for futher analysis if the area under the curve (AUC) is greater than 0.9.

## plot ROC curves to check whether all screens are OK
roc_results %>% bind_rows() %>% unite(screen, pubmed, cellline, condition, sep='_', remove=F) %>% 
  ggplot(aes(x=recall, y=precision, group=screen, colour=pubmed)) + 
  geom_line() + theme_classic() + geom_hline(yintercept=0.95) + 
  theme(legend.position = 'bottom') + 
  ylim(c(0.1, 1)) + xlim(c(0.1, 1)) 

plot of chunk unnamed-chunk-5


## highlight screens ROC < 0.9
roc_results %>% bind_rows() %>% 
  unite(screen, pubmed, cellline, condition, sep='_', remove=F) %>% 
  mutate(keep = ifelse(auc > 0.9, T, F)) %>%
  ggplot(aes(x=recall, y=precision, group=screen, colour=keep)) + 
  geom_line() + theme_classic() + geom_hline(yintercept=0.95) + 
  theme(legend.position = 'bottom') + 
  ylim(c(0.1, 1)) + xlim(c(0.1, 1)) +
  scale_colour_manual(values = c('#db4437', '#4285f4'))

plot of chunk unnamed-chunk-5

Some of the screens have low AUC values. We exclude these from the data before continuing with downstream analyses. We keep only screens with an AUC greater than 0.9. We plot the ROC curves again to visualize retained and excluded screens based on their precision-recall-curves.

## remove auc < 0.9
roc_results_fil <- roc_results %>% bind_rows() %>% 
  filter(auc > 0.9, !is.na(cutoff))

## plot roc curves again
roc_results_fil %>% 
  unite(screen, pubmed, cellline, condition, sep='_', remove=F) %>% 
  ggplot(aes(x=recall, y=precision, group=screen, colour=pubmed)) + 
  geom_line() + theme_classic() + geom_hline(yintercept=0.95) +  
  theme(legend.position = 'bottom') + 
  ylim(c(0.1, 1)) + xlim(c(0.1, 1))

plot of chunk unnamed-chunk-6

We next load the corresponding log2-fold changes to get sgRNA level dropout phenotypes for each screens.

## read fold changes object from disk
data('fchanges', package = 'HDCRISPR2019')

BAGEL results quality control

To assess if the BAGEL analysis worked as intended we generate a number of quality control visualizations. We first plot a histogram of all selected cut-offs for gene essentiality.

roc_results_fil %>%
  distinct(pubmed, cellline, condition, cutoff) %>%
  .$cutoff %>% hist()

plot of chunk unnamed-chunk-8

Most screens have a cutoff that matches what we would have expected. For some, however, the cutoff is quite high and for others it is considerably smaller than 0 (we would not expect any < 0 at all). It seems that these screens are mostly the ones performed using the library by Wang et al. We produce boxplots/density plots for the Wang 2015 screens (which are known to be of good quality).

fchanges %>% filter(pubmed == '26472758') %>%
  mutate(type = ifelse(GENE %in% ce$symbol, 'ce',
                ifelse(GENE %in% ne$symbol, 'ne', 'other'))) %>%
    ggplot(aes(fc, fill=type)) + geom_density() +
    facet_wrap(~cellline) +  theme_bw() +
    theme(panel.grid=element_blank())

plot of chunk unnamed-chunk-9


fchanges %>% filter(pubmed == '27260157') %>%
  mutate(type = ifelse(GENE %in% ce$symbol, 'ce',
                ifelse(GENE %in% ne$symbol, 'ne', 'other'))) %>%
    ggplot(aes(fc, fill=type)) + geom_density() +
    facet_wrap(~cellline) +  theme_bw() +
    theme(panel.grid=element_blank())

plot of chunk unnamed-chunk-9

These look fine. It could be that these screens separate core- and non-essential genes so well that the cutoff becomes negative to generate the 5% FDR that we allow. This might not be too surprising as these screens were used to actually generate the core- and nonessential gene lists. We can confirm this by selecting cutoffs for different false discovery rates.

## 4 screens with negative cutoff
negscreens <- list(
  list(pubmed = '26472758', cellline = 'Jiyoye'),
  list(pubmed = '24336569', cellline = 'KBM7'),
  list(pubmed = '27260157', cellline = 'RKO'),
  list(pubmed = '26472758', cellline = 'KBM7')
)

par(mfrow=c(2,2))
for(screen in negscreens){
  wscreen <- bagel_results %>% bind_rows() %>%
    filter(pubmed == screen$pubmed, cellline == screen$cellline) %>%
    mutate(label=ifelse(GENE %in% ce$symbol, 1,
                 ifelse(GENE %in% ne$symbol, 0, -1))) %>%
    filter(label != -1)

  wpred <- prediction(wscreen$BF, wscreen$label)
  wperf <- performance(wpred, measure='prec', x.measure='rec')
  cos <- lapply(seq(1, 0.9, by= -0.01), function(x){
    ifelse(length(which(wperf@y.values[[1]] > x)) == 0, NA,
      wperf@alpha.values[[1]][max(which(wperf@y.values[[1]] > x))]
    )
  })
  plot(seq(1, 0.9, by= -0.01), unlist(cos),
       xlab='1 - FDR', ylab='BF cutoff')
  if(max(unlist(cos), na.rm=T) > 0){
    abline(h=0)
  }
}

plot of chunk unnamed-chunk-10

This confirms what we expected. Hence we will just set the cutoff for these screens to 0.

roc_results_fil <- roc_results_fil %>%
  mutate(cutoff = ifelse(cutoff < 0, 0, cutoff))

We further compare a screen with a large AUC to a screen with a small AUC.

fchanges %>%
  filter(pubmed == '29083409', cellline %in% c('U178', 'GB1')) %>%
  mutate(type = ifelse(GENE %in% ce$symbol, 'ce',
                ifelse(GENE %in% ne$symbol, 'ne', 'other'))) %>%
  ggplot(aes(fc, fill=type)) + geom_density() +
  facet_wrap(~cellline) +  theme_bw() +
  theme(panel.grid=element_blank())

plot of chunk unnamed-chunk-12

This matches what we would have expected. While the separation is very clear in the screen with the high AUC, it is much smaller than in the screen with the low AUC value.

Merging data

We merge Bayes Factors and sgRNA fold changes into a shared data frame for downstream analysis.

## merge and apply cutoff.
merged <- fchanges %>%
  inner_join(roc_results_fil %>% distinct(pubmed, cellline,
                                          condition, cutoff)) %>%
  inner_join(bagel_results %>% bind_rows() %>%
               dplyr::select(pubmed, cellline,
                             condition, GENE, BF)) %>%
  mutate(hit = ifelse(BF > cutoff & BF > 0, T, F))

How many essentials do we get per screen? Is there anything unexpected going on?

## count essential genes per screen
ness_screen <- merged %>% distinct(pubmed, cellline, condition,
                                   cutoff, GENE, hit) %>%
  group_by(pubmed, cellline, condition, cutoff) %>%
  summarise(ness = sum(hit)) %>% ungroup()

hist(ness_screen$ness)

plot of chunk unnamed-chunk-14

This looks fine. The screens with low amounts of essential genes are screens with smaller libraries where only a limited number of genes were screened. Screens with the Yusa library were done use shorter 19 bp sgRNAs. We extend these sequences to 20 bp, which is what we want to use in our library. For simplicity, we assume that the 1 bp difference does not have a major impact on sgRNA performance.

data('yusa_map', package = 'HDCRISPR2019')

## adapt yusa sequences to 23 bp
yusa_adapted <- merged %>% filter(nchar(SEQID) == 22) %>%
  left_join(yusa_map %>% dplyr::rename(SEQID=short)) %>%
  mutate(SEQID=long) %>%
  filter(!is.na(long)) %>% distinct() %>% dplyr::select(-long)

## add back to the merged data
merged <- merged %>% filter(nchar(SEQID) != 22) %>%
  bind_rows(yusa_adapted)

## the data includes one CRISPRi screen that we remove
merged <- merged %>% filter(pubmed != '27661255')

Frequent hitters

As an additional validation we generate a plot that shows the fraction of cell lines in which selected genes are essential.

## number of hits per gene
hits_per_gene <- merged %>% filter(hit) %>%
  distinct(pubmed, cellline, condition, GENE) %>% count(GENE)

## hit frequencey per gene
freq_hits <- hits_per_gene %>%
  inner_join(merged %>% distinct(pubmed, cellline, condition, GENE) %>%
               count(GENE) %>% dplyr::rename(n_screens=n)) %>%
  mutate(fraction=round(n/n_screens, 2)*100)

## some annotation
oncogene <- c('NRAS', 'KRAS', 'BRAF', 'PIK3CA',
              'CTNNB1', 'MYC', 'MYB', 'AKT1', 'EGFR')

proteasome <- ce %>% filter(grepl('^PSM', symbol)) %>% .$symbol
spliceosome <- ce %>% filter(grepl('^SF', symbol)) %>% .$symbol
polymerases <- ce %>% filter(grepl('^POLR', symbol)) %>% .$symbol

plot_data <- freq_hits %>% arrange(desc(fraction)) %>%
  mutate(rank=1:n(),
         group=ifelse(GENE %in% oncogene, 'oncogenes',
               ifelse(GENE %in% proteasome, 'proteasome',
               ifelse(GENE %in% spliceosome, 'spliceosome',
               ifelse(GENE %in% polymerases, 'polymerases',
                      'rest')))))

plot_data %>% ggplot(aes(rank, fraction)) +
  geom_point(data=filter(plot_data, group == 'rest'),
             colour='#cccccc') +
  geom_point(data=filter(plot_data, group != 'rest'),
             aes(colour=group)) +
  theme_classic() +
  scale_colour_manual(values=c('#e41a1c', '#ff7f00',
                               '#377eb8', '#4daf4a')) +
  xlab('gene rank') + ylab('hit fraction (in %)')

plot of chunk unnamed-chunk-16

This looks as expected. Core-essential processes are essential in > 75 % of all cell lines, while oncogenes are essential only in specific contexts.

Selection of validated sgRNAs

In the next step we select empirically 'validated' sgRNAs based on the data set generated in previous sections. We select all genes that are called 'hits' (or essential) and select the sgRNAs targeting these hits and that rank amongst the 20% most depleted sgRNAs in the whole screen. We choose 20% cutoff as we expect up to 20% of the genes to be essential in a screen. This is a relaxed threshold and the true fraction of essential genes in most screens is likely to be around 10%.

## sgRNAs with strong depletion phenotypes that
## target essential genes
valid_sgRNAs <- merged %>%
  group_by(pubmed, cellline, condition) %>%
  mutate(q20=quantile(fc, probs=0.2)) %>%
  ungroup() %>%
  filter(hit, fc < q20)

## we also remember the bad sgRNAs so they can be excluded later
invalid_sgRNAs <- merged %>%
  group_by(pubmed, cellline, condition) %>%
  mutate(q20=quantile(fc, probs=0.2)) %>%
  ungroup() %>%
  filter(hit, fc > q20)

Now we count for each unique sgRNA in the determined set of sgRNAs the number of screens it showed a phenotype in.

valid_sgRNAs <- valid_sgRNAs %>% count(GENE, SEQID) %>% 
  arrange(GENE, desc(n))

We find 153016 unique sgRNA sequences targeting in total 16221 genes.

## histogram of hit count
valid_sgRNAs %>% ggplot(aes(n)) + 
  geom_histogram(bins=50) + 
  ggtitle('Number of hits per sgRNA')

plot of chunk unnamed-chunk-19

Examples of valid and invalid sgRNAs for core-essential genes.

## Example: SF1 in Jiyoye
df <- fchanges %>% filter(cellline == 'Jiyoye') %>% 
  mutate(SF1 = ifelse(GENE == 'SF1', T, F), 
         hit = ifelse(SEQID %in% 'GTAATGGGTGTGGGAAGCTGTGG', F, T))

## visualize as histogram
ggplot() + geom_histogram(data = df, aes(fc)) +
  geom_vline(xintercept = df %>% filter(SF1, hit) %>% pull(fc),
             colour = '#4285f4') +
  geom_vline(xintercept = df %>% filter(SF1, !hit) %>% pull(fc),
             colour = '#db4437') + 
  geom_vline(xintercept = quantile(df$fc, probs=0.2),
             linetype = 'dashed') + 
  xlab('fold change [log2]') + 
  ggtitle('SF1-targeting sgRNAs in Jiyoye cells')

plot of chunk unnamed-chunk-20


## visualize as jitter plot
fchanges %>% filter(pubmed == '26472758', GENE == 'NOP2') %>% 
  mutate(invalid = ifelse(SEQID %in% 'GGAGCATTAAATAGGGACTGGGG', F, T),
         fc = ifelse(fc < -4, -4, fc)) %>%
  group_by(SEQID) %>% mutate(mean_fc = mean(fc)) %>% ungroup() %>%
  arrange(desc(mean_fc)) %>%
  mutate(SEQID = factor(SEQID, levels = unique(SEQID))) %>%
  ggplot(aes(SEQID, fc, colour = invalid)) + 
  geom_jitter(width = 0.2) + 
  stat_summary(fun.y = 'mean', 
               fun.ymin = 'mean', 
               fun.ymax = 'mean', 
               geom='crossbar', 
               color = 'red', width = 0.5) + 
  geom_hline(yintercept = 0, linetype = 'dashed') +
  ylab('fold change [log2]') + xlab('') +
  scale_colour_manual(values = c('#4285f4', '#111111')) +
  coord_flip() + 
  theme_cowplot() + 
  ggtitle('NOP2 sgRNA phenotypes')

plot of chunk unnamed-chunk-20

Annotated pool of sgRNAs

We load an annotated pool of sgRNAs that compiles all relevant information.

data('sgRNA_pool_refined', package = 'HDCRISPR2019')

We visualize core-essential targeting sgRNAs, comparing validated in ineffective guides. We do a similar analysis for non-essential gene-targeting sgRNAs comparing toxic and non-toxic guides.

## list of 'invalid' sequences
inv_seqs <- invalid_sgRNAs %>% 
  mutate(nopam = substr(SEQID, 1, 20)) %>% 
  pull(nopam)

## density plot of 'valid' and 'invalid' sgRNAs (core-essential)
merged %>% filter(GENE %in% ce$symbol) %>% 
  mutate(nopam = substr(SEQID, 1, 20)) %>%
  mutate(ineffective = ifelse(nopam %in% inv_seqs, T, F)) %>% 
  ggplot(aes(fc, colour = ineffective)) + 
  geom_density(bw = 0.5) + 
  scale_y_continuous(expand = c(0,0)) + 
  xlab('fold change [log2]') + xlim(c(-13, 5)) + 
  geom_vline(xintercept = 0, linetype = 'dashed')

plot of chunk unnamed-chunk-22


## similar plot for non-essential genes
noness_outliers <- pool %>% filter(genes_hit %in% ne$symbol) %>% 
  mutate(outlier = ifelse(effect_z < -1.25, T, F)) %>% 
  filter(outlier) %>% pull(sequence)

merged %>% filter(GENE %in% ne$symbol) %>% 
  mutate(outlier = ifelse(SEQID %in% noness_outliers, T, F)) %>% 
  ggplot(aes(fc, colour = outlier)) + geom_density() + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  scale_y_continuous(expand = c(0, 0)) + xlim(c(-5, 3))

plot of chunk unnamed-chunk-22

How many sgRNAs do we have for each gene?

## count number of guides per gene
guides_per_gene <- pool %>% count(genes_hit) 

## generate histogram
guides_per_gene %>% ggplot(aes(n)) + geom_histogram(bins = 30) + xlab('sgRNAs per gene')

plot of chunk unnamed-chunk-23


## calculate mean/median
guides_per_gene %>% summarise(median = median(n), avg = mean(n))
#> # A tibble: 1 x 2
#>   median   avg
#>    <dbl> <dbl>
#> 1     25  24.7

Removing BbsI restriction sites and setting minimal number of screens

Some additional restrictions: require minimum number of screens for consideration and remove BbsI restriction sites.

is_bbs1 <- function(sequence){
  bbs1 <- c(
    which(grepl('GAAGAC', sequence)),
    which(grepl(system('echo GAAGAC | tr "ACGT" "TGCA" | rev', intern=T), sequence))
  )
  res <- logical(length=length(sequence))
  res[bbs1] <- TRUE
  return(res)
}

## consider only performance based on at least 5 screens
pool <- pool %>% mutate(screen_hit_ratio = ifelse(n_screens >= 5, screen_hit_ratio, 0))
## remove BbsI restriction sites
pool <- pool %>% filter(!is_bbs1(nopam))

How do we avoid enriching for off-targets

We exclude sgRNAs that show phenotypes that do not fit with the other designs targeting the same gene.

## a distribution of effect Z-scores
pool %>% mutate(group = ifelse(genes_hit %in% ne$symbol, 'non-essential', 
                        ifelse(genes_hit %in% ce$symbol, 'core-essential', 
                               'other'))) %>% 
  filter(group != 'other') %>% 
  ggplot(aes(effect_z, mean_log2fc)) + 
  geom_hex(bins=100, color = '#000000') + 
  geom_hline(yintercept = 0, linetype='dashed') + geom_vline(xintercept = c(-1.25, 1.25), 
             linetype = 'dashed', colour = '#db4437') +
  xlab('GenomeCRISPR effect score [Z-score]') + 
  facet_wrap(~group) + 
  ylab('Average log2 fold change')

plot of chunk unnamed-chunk-25


## KRT35 example
krt35_offtarget <- pool %>% filter(genes_hit %in% ne$symbol, effect_z < -3) %>% 
  filter(genes_hit == 'KRT35') %>% pull(sequence)

fchanges %>% filter(GENE == 'KRT35', pubmed == '26472758') %>% 
  mutate(offtarget = ifelse(SEQID == krt35_offtarget, T, F)) %>% 
  ggplot(aes(fc, SEQID, colour = offtarget)) + geom_jitter() + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  scale_colour_manual(values = c('#444444', '#4285f4')) + 
  ggtitle('KRT35 phenotypes in Wang, 2015')

plot of chunk unnamed-chunk-25

Summary statistics of final library

We parse the final library sequence lists from disk.

data('hdcrispr_seqs', package = 'HDCRISPR2019')

We calculate the percentage of validated sgRNAs for each sub-library.

valid_seqs <- valid_sgRNAs %>% filter(n>=1) %>% 
  mutate(nopam = substr(SEQID, 1, 20)) %>% pull(nopam)

## annoate maximal screen-hit-ratio per gene
pool <- pool %>% group_by(genes_hit) %>% mutate(max_shr = max(screen_hit_ratio)) %>% ungroup()

## calculate percentages, make pie chart.
n_a <- libA_seqs %>% dplyr::select(nopam) %>% 
  left_join(pool %>% dplyr::select(nopam, screen_hit_ratio, max_shr, in_libraries)) %>% 
  mutate(valid = ifelse(is.na(in_libraries), 'de_novo',
                 ifelse(is.na(screen_hit_ratio), 'empirical_nonessential', 
                 ifelse(screen_hit_ratio > 0 & max_shr > 0.05, 'empirical_essential', 
                        'empirical_nonessential'))))

n_b <- libB_seqs %>% dplyr::select(nopam) %>% 
  left_join(pool %>% dplyr::select(nopam, screen_hit_ratio, max_shr, in_libraries)) %>% 
  mutate(valid = ifelse(is.na(in_libraries), 'de_novo',
                 ifelse(is.na(screen_hit_ratio), 'empirical_nonessential', 
                 ifelse(screen_hit_ratio > 0 & max_shr > 0.05, 'empirical_essential', 
                        'empirical_nonessential'))))

## fractions
n_a %>% count(valid) %>% mutate(frac = (n/sum(n)*100))
n_b %>% count(valid) %>% mutate(frac = (n/sum(n)*100))

## pie charts
n_a %>% count(valid) %>% 
  ggplot(aes(x='', y=n, fill=valid)) + 
  geom_bar(stat='identity') + coord_polar('y', start = 0) + 
  scale_fill_manual(values = c('#D3D3D3', '#053061', '#377EB8')) + 
  theme_nothing()

plot of chunk unnamed-chunk-27


n_b %>% count(valid) %>% 
  ggplot(aes(x='', y=n, fill=valid)) + 
  geom_bar(stat='identity') + coord_polar('y', start = 0) + 
  scale_fill_manual(values = c('#D3D3D3', '#053061', '#377EB8')) +
  theme_nothing()

plot of chunk unnamed-chunk-27

We plot the distribution of exon ranks/predicted off-targets per sgRNA.

## library guides with annotations
lib_df <- libA_seqs %>% dplyr::select(nopam) %>% mutate(lib = 'library A') %>% 
  bind_rows(libB_seqs %>% dplyr::select(nopam) %>% mutate(lib = 'library B')) %>%
  inner_join(pool)

## exon ranks
lib_df %>% 
  ggplot(aes(exon_rank)) + geom_histogram(bins=20) + 
  xlim(c(1,20)) + facet_wrap(~lib, ncol=1)

plot of chunk unnamed-chunk-28


## off-targets
lib_df %>% ggplot(aes(off_targets)) + 
  geom_histogram(bins=20) + facet_wrap(~lib, ncol=1)

plot of chunk unnamed-chunk-28

How is the HD CRISPR library composed of designs in previous libraries?

## load sequences that are also found with CLD
data('cld_seqs', package = 'HDCRISPR2019')

## generate upset plot
lib_df %>% split(.$lib) %>% walk(~{
  list(cld = cld_seqs,
     novartis = filter(.x, grepl('Novartis', in_libraries)) %>% .$nopam,
     brunello = filter(.x, grepl('Brunello', in_libraries)) %>% .$nopam,
     tkov3 = filter(.x, grepl('TKOv3', in_libraries)) %>% .$nopam,
     sabatini2014 = filter(.x, grepl('Sabatini2014', in_libraries)) %>% .$nopam,
     yusa2016 = filter(.x, grepl('Yusa2016', in_libraries)) %>% .$nopam,
     avana = filter(.x, grepl('Avana', in_libraries)) %>% .$nopam,
     tkov1 = filter(.x, grepl('TKOv1', in_libraries)) %>% .$nopam,
     gecko2 = filter(.x, grepl('GeCKOv2', in_libraries)) %>% .$nopam,
     sabatini = filter(.x, grepl('Sabatini', in_libraries)) %>% .$nopam
  ) %>% fromList() %>% upset(nsets=10)
})

We make a plot showing phenotypic range for individual guides.

effect_diff <- pool %>% group_by(genes_hit) %>%
  filter(n() > 2) %>%
  summarize(effect_diff = max(mean_effect) - min(mean_effect)) %>%
  ungroup() %>%
  mutate(group = 'all')
#> `summarise()` ungrouping output (override with `.groups` argument)

published_libs <- pool %>% 
  dplyr::select(nopam, genes_hit, mean_effect, in_libraries) %>%
  separate_rows(in_libraries, sep=',') %>%
  filter(!in_libraries %in% c('Sabatini2014', 'Sabatini2015', 'Novartis'),
         !is.na(in_libraries)) %>%
  group_by(genes_hit, in_libraries) %>%
  filter(n() > 2) %>%
  summarize(effect_diff = max(mean_effect) - min(mean_effect)) %>%
  ungroup() %>%
  dplyr::rename(group = in_libraries)
#> `summarise()` regrouping output by 'genes_hit' (override with `.groups` argument)

effect_hd_crispr_a <- pool %>% 
  filter(nopam %in% libA_seqs$nopam) %>% 
  group_by(genes_hit) %>%
  filter(n() > 2) %>%
  summarize(effect_diff = max(mean_effect) - min(mean_effect)) %>%
  ungroup() %>%
  mutate(group = 'HD CRISPR A')
#> `summarise()` ungrouping output (override with `.groups` argument)

effect_hd_crispr_b <- pool %>% 
  filter(nopam %in% libB_seqs$nopam) %>% 
  group_by(genes_hit) %>%
  filter(n() > 2) %>%
  summarize(effect_diff = max(mean_effect) - min(mean_effect)) %>%
  ungroup() %>%
  mutate(group = 'HD CRISPR B')
#> `summarise()` ungrouping output (override with `.groups` argument)

## plot, only genome-wide libraries
effect_diff %>% bind_rows(published_libs) %>%
  bind_rows(effect_hd_crispr_a) %>% 
  bind_rows(effect_hd_crispr_b) %>% 
  group_by(group) %>%
  mutate(median = median(effect_diff)) %>%
  ungroup() %>%
  arrange(median) %>%
  mutate(group = factor(group, levels = unique(group))) %>%
  ggplot(aes(group, effect_diff)) + 
  geom_boxplot() + 
  ylab('max(sgRNA effect) - min(sgRNA effect)') +
  coord_flip()

plot of chunk unnamed-chunk-30

Sequence characteristics

We examine how the HD CRISPR library compares to other libraries or random guides based on certain published sequence properties (e.g. Ruleset 2 score). We generate a file containing sgRNA sequences that we can evalute.

Ruleset 2 scores

We used the software published in Doench et al., 2016 to calculate ruleset 2 scores for various published libraries as well as the HD CRISPR libraries. We load and visualize the results.

data('rs2_scores', package = 'HDCRISPR2019')

## published libraries
rs2_scores %>% 
  ggplot(aes(rs2_score, color = in_libraries)) + 
  geom_density() + 
  scale_y_continuous(expand = c(0,0))

plot of chunk unnamed-chunk-31


## median brunello and random
med_brunello <- rs2_scores %>% filter(in_libraries == 'Brunello') %>% 
  pull(rs2_score) %>% median()
set.seed(1234)
med_random <- rs2_scores %>% slice_sample(n = 70000) %>% 
  pull(rs2_score) %>% median()

## with hd crispr
df <- lib_df %>% dplyr::select(lib, nopam) %>% 
  inner_join(rs2_scores %>% dplyr::select(nopam, rs2_score) %>% distinct()) %>%
  inner_join(bind_rows(n_a, n_b) %>% dplyr::select(nopam, valid)) %>%
  bind_rows(rs2_scores %>% distinct(rs2_score, nopam) %>%
              slice_sample(n = 70000) %>% mutate(lib = 'random')) %>%
  bind_rows(rs2_scores %>% dplyr::select(lib = in_libraries, rs2_score) %>%
              filter(lib %in% c('TKOv3', 'Brunello', 'GeCKOv2'))) %>%
  group_by(lib) %>% mutate(med = median(rs2_score)) %>% ungroup() %>%
  arrange(med) %>% 
  mutate(lib = factor(lib, levels = unique(lib)),
         group2 = ifelse(grepl('^library', lib), 'HD CRISPR',
                 ifelse(lib == 'random', 'random', 'reference')))
#> Joining, by = "nopam"
#> Joining, by = "nopam"

df %>% ggplot(aes(lib, rs2_score, fill = group2)) + 
  geom_violin() +  
  stat_summary(fun.y = 'median', fun.ymax = 'median', fun.ymin = 'median',
               geom = 'crossbar', color = 'black') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  scale_fill_manual(values = c('#053061', '#aaaaaa', '#B2182B'))
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
#> Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

plot of chunk unnamed-chunk-31


df %>% filter(group2 == 'HD CRISPR') %>%
  unite(group, lib, valid, sep = '_', remove =F) %>%
  group_by(group) %>% mutate(med = median(rs2_score)) %>% ungroup() %>%
  ggplot(aes(group, rs2_score, fill = valid)) + 
  geom_violin(aes(group, rs2_score, fill = valid)) +  
  stat_summary(fun.y = 'median', fun.ymax = 'median', fun.ymin = 'median',
               geom = 'crossbar', color = 'black') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  scale_fill_manual(values = c('#D3D3D3', '#053061', '#377EB8'))
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
#> Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

plot of chunk unnamed-chunk-31

Deep HF score

Deep HF scores were calculated using the software published be authors of the score (derrived from GitHub). We load the results and visualize them.

data('deephf_scores', package = 'HDCRISPR2019')

## visualize published libraries
deephf_scores %>% ggplot(aes(deephf_score, color = in_libraries)) + 
  geom_density() + scale_y_continuous(expand = c(0,0))

plot of chunk unnamed-chunk-32


## selected published libraries plus hd crispr
deephf_scores %>%
    bind_rows(lib_df %>% dplyr::select(in_libraries=lib, sequence = nopam) %>%
                  inner_join(deephf_scores %>% dplyr::select(sequence, deephf_score))) %>% 
  filter(in_libraries %in% c('TKOv3', 'TKOv1', 'GeCKOv2', 'Brunello', 'library A', 'library B')) %>% 
  ggplot(aes(deephf_score)) + geom_density(aes(color = in_libraries))  + 
  scale_y_continuous(expand = c(0,0))
#> Joining, by = "sequence"

plot of chunk unnamed-chunk-32

We visualize the Deep HF scores for the HD CRISPR libraries.

## median brunello and random
med_brunello <- deephf_scores %>% filter(in_libraries == 'Brunello') %>% 
  pull(deephf_score) %>% median()
set.seed(1234)
med_random <- deephf_scores %>% slice_sample(n = 70000) %>% 
  pull(deephf_score) %>% median()

## hd crispr empirical vs non-empirical
df <- lib_df %>% dplyr::select(lib, sequence = nopam) %>% 
  inner_join(deephf_scores) %>%
  inner_join(bind_rows(n_a, n_b) %>% dplyr::select(sequence = nopam, valid)) %>%
  bind_rows(deephf_scores %>% distinct(deephf_score, sequence) %>%
              slice_sample(n = 70000) %>% mutate(lib = 'random')) %>%
  bind_rows(deephf_scores %>% dplyr::select(lib = in_libraries, deephf_score) %>%
              filter(lib %in% c('TKOv3', 'Brunello', 'GeCKOv2'))) %>%
  group_by(lib) %>% mutate(med = median(deephf_score)) %>% ungroup() %>%
  arrange(med) %>% 
  mutate(lib = factor(lib, levels = unique(lib)),
         group = ifelse(grepl('^library', lib), 'HD CRISPR',
                 ifelse(lib == 'random', 'random', 'reference')))
#> Joining, by = "sequence"
#> Joining, by = "sequence"

df %>% ggplot(aes(lib, deephf_score, fill = group)) + 
  geom_violin() +  
  stat_summary(fun.y = 'median', fun.ymax = 'median', fun.ymin = 'median',
               geom = 'crossbar', color = 'black') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  scale_fill_manual(values = c('#053061', '#aaaaaa', '#B2182B'))
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
#> Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

plot of chunk unnamed-chunk-33


df %>% filter(group == 'HD CRISPR') %>%
  unite(group, lib, valid, sep = '_', remove =F) %>%
  ggplot(aes(group, deephf_score, fill = valid)) + 
  geom_violin() +  
  stat_summary(fun.y = 'median', fun.ymax = 'median', fun.ymin = 'median',
               geom = 'crossbar', color = 'black') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  scale_fill_manual(values = c('#D3D3D3', '#053061', '#377EB8'))
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
#> Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

plot of chunk unnamed-chunk-33

Hart gRNA sequence score

Hart et al. 2017 derived a sequence score that we can also used to evaluate guide RNAs in the HD CRISPR library. For this purpose a matrix was published containing a score for each nucleotide at each position of a 20-bp guide RNA. We load this matrix and use it to calculate the corresponding scores.

data('tko3_score', package = 'HDCRISPR2019')

## score all sequences, takes a while since not vectorized
tko3_scores <- pool %>% dplyr::select(in_libraries, nopam) %>%
  rowwise() %>% 
  mutate(tko3_score = PWMscoreStartingAt(tko3_score, nopam, starting.at = 1)) %>%
  ungroup()

## visualize the results
tko3_scores %>% separate_rows(in_libraries, sep = ',') %>%
  ggplot(aes(tko3_score, color = in_libraries)) + geom_density()

plot of chunk unnamed-chunk-34


## hd crispr library
set.seed(1234)
df <- lib_df %>% dplyr::select(lib, nopam) %>% 
  inner_join(tko3_scores %>% dplyr::select(-in_libraries) %>% distinct()) %>%
  inner_join(bind_rows(n_a, n_b) %>% dplyr::select( nopam, valid)) %>%
  bind_rows(tko3_scores %>% distinct(tko3_score, nopam) %>%
              slice_sample(n = 70000) %>% mutate(lib = 'random')) %>%
  bind_rows(tko3_scores %>% dplyr::select(lib = in_libraries, tko3_score) %>%
              filter(lib %in% c('TKOv3', 'Brunello', 'GeCKOv2'))) %>%
  group_by(lib) %>% mutate(med = median(tko3_score)) %>% ungroup() %>%
  arrange(med) %>% 
  mutate(lib = factor(lib, levels = unique(lib)),
         group = ifelse(grepl('^library', lib), 'HD CRISPR',
                 ifelse(lib == 'random', 'random', 'reference')))
#> Joining, by = "nopam"
#> Joining, by = "nopam"

df %>% ggplot(aes(lib, tko3_score, fill = group)) + 
  geom_violin() +  
  stat_summary(fun.y = 'median', fun.ymax = 'median', fun.ymin = 'median',
               geom = 'crossbar', color = 'black') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  scale_fill_manual(values = c('#053061', '#aaaaaa', '#B2182B'))
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
#> Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

plot of chunk unnamed-chunk-34


df %>% filter(group == 'HD CRISPR') %>%
  unite(group, lib, valid, sep = '_', remove =F) %>%
  group_by(group) %>% mutate(med = median(tko3_score)) %>% ungroup() %>%
  ggplot(aes(group, tko3_score, fill = valid)) + 
  geom_violin() +  
  stat_summary(fun.y = 'median', fun.ymax = 'median', fun.ymin = 'median',
               geom = 'crossbar', color = 'black') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  scale_fill_manual(values = c('#D3D3D3', '#053061', '#377EB8'))
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
#> Warning: `fun.ymax` is deprecated. Use `fun.max` instead.

plot of chunk unnamed-chunk-34

Screen analysis

Data loading

In total we have performed 6 genome-wide dropout screens. We screened a bulk and two single cell clones with the sub-library A and B, respetively. We next load the raw count data for these experiments. The raw counts were determined from the sequencing data using Mageck-count.

data('screen_counts', package = 'HDCRISPR2019')

## combine datasets and format as long df
counts_df <- counts_a %>% dplyr::select(Plasmid = plasmid_LibA, everything()) %>% 
  gather(sample, count, -c(sgRNA, Gene)) %>% 
  mutate(library = 'A') %>% 
  bind_rows(counts_b %>% dplyr::select(-Pool_T0) %>% 
              gather(sample, count, -c(sgRNA, Gene)) %>% 
              mutate(library = 'B'))

We plot the skew ratio and check missing sgRNAs in both plasmid libraries.

counts_df %>% filter(sample == 'Plasmid') %>% 
  ggplot(aes(log(count+1))) + 
  geom_density(aes(color = library)) + 
  scale_y_continuous(expand=c(0,0)) + 
  scale_colour_manual(values = c('#85929E', '#2166AC'))

plot of chunk unnamed-chunk-36

counts_df %>% filter(sample == 'Plasmid') %>% 
  ggplot(aes(count)) + 
  geom_density(aes(color = library)) + 
  scale_y_continuous(expand=c(0,0)) + 
  scale_colour_manual(values = c('#85929E', '#2166AC'))

plot of chunk unnamed-chunk-36


## calculate skew ratio
counts_df %>% filter(sample == 'Plasmid') %>% 
  group_by(library) %>% 
  summarise(q10 = quantile(count, probs=0.1), 
            q90 = quantile(count, probs=0.9), 
            skew_ratio = q90/q10) %>% ungroup()

## how many guides are missing? which?
counts_df %>% filter(sample == 'Plasmid') %>% 
  filter(count == 0)  %>% count(library)
counts_df %>% filter(sample == 'Plasmid') %>% 
  filter(count == 0)

Normalization

First, we normalize each sample by dividing through the median count of all targeting controls. Before we do that we add a global pseudo count to avoid dividing by 0.

## list of non-targeting CTRLs
nontarg <- c('NONTARG', 'LacZ', 'EGFP', 'luciferase')
ctrls <- counts_df %>% distinct(sgRNA, Gene) %>% 
  filter(grepl('CONTROL', sgRNA)) %>% 
  mutate(ctrl_type = ifelse(Gene %in% nontarg, 'nontarg', 'targ'))

## median of targeting controls per library
targ_med <- counts_df %>% 
  filter(sgRNA %in% (filter(ctrls, ctrl_type == 'targ') %>% pull(sgRNA))) %>% 
  group_by(library, sample) %>% 
  summarise(med_count = median(count + 1)) %>% ungroup()

## perform normalization
norm_df <- counts_df %>% inner_join(targ_med) %>% 
  mutate(norm_count = ((count+1)/med_count)*median(count+1))

We plot log-scaled distributions of normalized counts for each sample. We also plot correlations of normalized counts between replicates.

## norm-count [log] distributions per sample
norm_df %>% ggplot(aes(log(norm_count))) + 
  geom_density(aes(colour = sample)) + 
  facet_wrap(~library) + 
  scale_y_continuous(expand=c(0, 0))

plot of chunk unnamed-chunk-38


## reproducibility at norm-count level
df_samples <- norm_df %>% filter(sample != 'Plasmid') %>% 
  separate(sample, c('type', 'time', 'rep'), sep='_', remove=F) %>% 
  dplyr::select(sgRNA, library, type, rep, norm_count) %>% 
  mutate(type = ifelse(type == 'bulk', 'Pool', type))

df_samples %>% spread(rep, norm_count) %>% 
  ggplot(aes(R1, R2)) + geom_hex(bins=100) + 
  geom_abline(linetype = 'dashed') +
  facet_grid(library ~ type)

plot of chunk unnamed-chunk-38


## correlation coefficients
df_samples %>% spread(rep, norm_count) %>% 
  group_by(library, type) %>% 
  summarise(PCC = cor(R1, R2, method='pearson'), 
            SCC = cor(R1, R2, method='spearman')) %>% ungroup()

Fold changes

We next calculate log2-scaled fold changes for each sample by dividing log2-scaled normalized counts for each sample by the log2-scaled normalized counts of the correpsonding plasmid libraries.

## get plasmid counts
plasmid_counts <- norm_df %>% filter(sample == 'Plasmid') %>% 
  distinct(library, sgRNA, norm_count) %>% 
  dplyr::select(plasmid_count = norm_count, everything())

## calculate log-fold-changes
fold_changes <- df_samples %>% inner_join(plasmid_counts) %>% 
  mutate(log2fc = log2(norm_count) - log2(plasmid_count)) %>% 
  ungroup()

We can now plot fold-change distributions for each of the samples.

fold_changes %>% ggplot(aes(log2fc)) + 
  geom_density(aes(colour = type)) + facet_wrap(~ library) + 
  scale_y_continuous(expand=c(0,0)) + 
  geom_vline(xintercept = 0, linetype = 'dashed')

plot of chunk unnamed-chunk-40

In addition we can compare dropout (quantified as log2-scaled fold changes) of core-essential genes compared to non-essential genes in each sample. We do this both for each replicate separately and for merged replicates (by averaging).

## data frame with target ENSG and gene symbol for each sgRNA
sgrna_targets <- counts_df %>% distinct(sgRNA, Gene) %>% 
  left_join(distinct(lib_df, genes_hit, ensg) %>% 
              dplyr::select(Gene=ensg, symbol=genes_hit))

## annotate fold changes with target gene info
fold_changes <- fold_changes %>% left_join(sgrna_targets)

## select core-/non-essential genes
cene <- fold_changes %>% filter(symbol %in% c(ce$symbol, ne$symbol)) %>% 
  mutate(gene_type = ifelse(symbol %in% ce$symbol, 'core_essential', 'non_essential'))

## plot fold changes of CEG vs. NEG
cene %>% ggplot(aes(log2fc, colour = gene_type, linetype = rep)) + 
  geom_density() + facet_grid(library ~ type) + 
  scale_y_continuous(expand = c(0,0))

plot of chunk unnamed-chunk-41


## same, but merging replicates
cene %>% group_by(library, type, gene_type, sgRNA) %>% 
  summarise(log2fc = mean(log2fc)) %>% ungroup() %>%
  ggplot(aes(log2fc, colour = gene_type)) + 
  geom_density(aes(linetype = type)) + facet_grid(~library) + 
  scale_y_continuous(expand = c(0,0)) +
  scale_colour_manual(values = c('#B2182B', '#85929E')) + 
  xlab('fold change [log2]')

plot of chunk unnamed-chunk-41

Finally we can also look at the reproducibility between replicates at the log2-fold change level.

## scatter plots
fold_changes %>% 
  dplyr::select(library, sgRNA, type, log2fc, rep) %>% 
  spread(rep, log2fc) %>% ggplot(aes(R1, R2)) + 
  geom_point_rast(colour = '#D3D3D3') + 
  facet_grid(library ~ type) + 
  geom_abline(linetype='dashed') + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  geom_hline(yintercept = 0, linetype = 'dashed')

plot of chunk unnamed-chunk-42


## correlation coefficients
fold_changes %>% 
  dplyr::select(library, sgRNA, type, log2fc, rep) %>% 
  spread(rep, log2fc) %>% 
  group_by(library, type) %>% 
  summarise(PCC = cor(R1, R2, method='pearson'), 
            SCC = cor(R1, R2, method='spearman')) %>% 
  ungroup()

Percentage ranks

We generate an alternative visualization showing percent-ranks for each sgRNA compared to the cumulative fraction.

fold_changes %>% 
  group_by(library, type, Gene, symbol, sgRNA) %>% 
  summarise(log2fc = mean(log2fc)) %>% ungroup() %>%
  mutate(gene_type = ifelse(symbol %in% ce$symbol, 'core-essential', 
                     ifelse(symbol %in% ne$symbol, 'non-essential', 
                     ifelse(Gene == 'NONTARG', 'non-targeting',
                                   'other')))) %>% 
  group_by(library, type) %>%
  mutate(perc_rank = percent_rank(log2fc)) %>% 
  ungroup() %>%
  filter(gene_type != 'other') %>%
  ggplot(aes(perc_rank, color = type, linetype = gene_type)) + 
  stat_ecdf() + 
  scale_x_continuous(expand = c(0,0)) + 
  scale_y_continuous(expand = c(0,0)) + 
  facet_wrap(~ library) +
  scale_colour_manual(values = c('#67001F', '#053061', '#4393C3')) + 
  xlab('percent-rank of sgRNAs') + 
  ylab('cumulative fraction')
#> `summarise()` regrouping output by 'library', 'type', 'Gene', 'symbol' (override with `.groups` argument)

plot of chunk unnamed-chunk-43

We show AUC values for these curves looking at individual replicates separately.

ecdf_aucs <- fold_changes %>% group_by(library, type, rep) %>%
  arrange(log2fc) %>%
  mutate(perc_rank = percent_rank(log2fc)) %>% 
  ungroup() %>%
  mutate(gene_type = ifelse(symbol %in% ce$symbol, 'core-essential', 
                     ifelse(symbol %in% ne$symbol, 'non-essential', 
                     ifelse(Gene == 'NONTARG', 'non-targeting',
                                   'other')))) %>% 
  filter(gene_type != 'other') %>%
  group_by(library, type, rep, gene_type) %>%
  group_modify(~{
    ## get ecdf function
    fun_ecdf <- ecdf(.x$perc_rank)
    .x %>% mutate(cum_frac = fun_ecdf(perc_rank))
  }) %>% 
  summarise(auc = pracma::trapzfun(approxfun(perc_rank, cum_frac,
                                             yleft = 0, yright = 1),
                                   a = 0, b = 1)$value) %>%
  ungroup()
#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values

#> Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
#> 'x' values
#> `summarise()` regrouping output by 'library', 'type', 'rep' (override with `.groups` argument)

## generate auc plot
ecdf_aucs %>% ggplot(aes(type, auc, color = gene_type)) + 
  geom_jitter(width = 0.1, size = 3) + 
  facet_wrap(~library) + 
  scale_y_continuous(limits = c(0.0, 1.0)) + 
  scale_colour_manual(values = c('#67001F', '#053061', '#4393C3')) + 
  xlab('') + ylab('AUC')

plot of chunk unnamed-chunk-44

## select three representative screens:
## K562 Sabatini, A375 Avana, Hela TKO
fc_select <- fchanges %>% filter(cellline %in% c('K562', 'DLD1'), 
                                 !pubmed %in% c('27661255', '26780180',
                                                '27260157', '28700943'))

## add inhouse TKO screen in Hap1 cells
data('hap1_tko1', package = 'HDCRISPR2019')

## hap1 hdcrispr screens
fc_combined <- fold_changes %>% 
  select(sgRNA, symbol, library, type, rep, log2fc) %>% 
  spread(rep, log2fc) %>% 
  mutate(log2fc = rowMeans(cbind(R1, R2)), 
         library = paste0('HDCRISPR_', library),
         cellline = paste('HAP1', type)) %>%
  dplyr::select(sgRNA, symbol, library, cellline, log2fc) %>%
  ## hap1 tko screen
  bind_rows(hap1_tko1 %>% select(-files) %>%
              rename(sgRNA = sequence) %>% 
              mutate(log2fc = rowMeans(cbind(R1, R1))) %>% 
              select(sgRNA, symbol, library, cellline, timepoint, log2fc)) %>%
  ## external screens
  bind_rows(fc_select %>% 
              mutate(library = ifelse(pubmed == '26472758', 'Sabatini',
                               ifelse(pubmed == '27260156', 'GeCKOv2',
                               ifelse(pubmed == '29083409', 'Avana',
                               ifelse(pubmed == '26627737', 'TKOv1', 'other'))))) %>%
              select(sgRNA = SEQID, symbol = GENE, library, 
                     cellline, log2fc = fc)) %>%
  mutate(type = gsub(' NA$', '', paste(library, cellline, timepoint)))

## density plot
fc_combined %>% 
  mutate(group = ifelse(grepl('HDCRISPR', library), 'HAP1 HDCRISPR',
                 ifelse(!is.na(timepoint), 'HAP1 TKO', 'Published screens')),
         gene_type = ifelse(symbol %in% ce$symbol, 'core-essential', 
                     ifelse(symbol %in% ne$symbol, 'non-essential', 
                      'other'))) %>% 
  filter(gene_type != 'other') %>%
  ggplot(aes(log2fc, color = type)) + 
  geom_density(aes(linetype = gene_type)) + 
  facet_wrap(~group, scales = 'free_y') + 
  scale_y_continuous(expand = c(0,0))

plot of chunk unnamed-chunk-45


## generate percentage rank plot
fc_combined %>% 
  group_by(type, symbol, sgRNA) %>% 
  summarise(log2fc = mean(log2fc)) %>% ungroup() %>%
  mutate(gene_type = ifelse(symbol %in% ce$symbol, 'core-essential', 
                     ifelse(symbol %in% ne$symbol, 'non-essential', 
                      'other'))) %>% 
  group_by(type) %>%
  mutate(perc_rank = percent_rank(log2fc)) %>% 
  ungroup() %>%
  filter(gene_type != 'other') %>%
  ggplot(aes(perc_rank, color = type, linetype = gene_type)) + 
  stat_ecdf() + 
  scale_x_continuous(expand = c(0,0)) + 
  scale_y_continuous(expand = c(0,0)) +
  xlab('percent-rank of sgRNAs') + 
  ylab('cumulative fraction')

plot of chunk unnamed-chunk-45


ecdf_aucs_combined <- fc_combined %>% group_by(type) %>%
  arrange(log2fc) %>%
  mutate(perc_rank = percent_rank(log2fc)) %>% 
  ungroup() %>%
  mutate(gene_type = ifelse(symbol %in% ce$symbol, 'core-essential', 
                     ifelse(symbol %in% ne$symbol, 'non-essential', 
                      'other'))) %>% 
  filter(gene_type != 'other') %>%
  group_by(library, type, gene_type) %>%
  group_modify(~{
    ## get ecdf function
    fun_ecdf <- ecdf(.x$perc_rank)
    .x %>% mutate(cum_frac = fun_ecdf(perc_rank))
  }) %>% 
  summarise(auc = pracma::trapzfun(approxfun(perc_rank, cum_frac,
                                             yleft = 0, yright = 1),
                                   a = 0, b = 1)$value) %>%
  ungroup()

## generate auc plot
ecdf_aucs_combined %>% arrange(desc(auc)) %>%
  mutate(type = factor(type, levels = unique(type))) %>%
  ggplot(aes(type, auc, color = gene_type)) + 
  geom_jitter(width = 0.1, size = 3) +
  scale_y_continuous(limits = c(0.35, 1.0)) + 
  scale_colour_manual(values = c('#67001F', '#053061', '#4393C3')) + 
  theme(axis.text.x = element_text(angle=45, hjust=1),
        panel.grid.major.y = element_line(color = '#dddddd'),
        panel.grid.minor.y = element_line(color = '#dddddd')) + 
  xlab('') + ylab('AUC')

plot of chunk unnamed-chunk-45

Hit calling with BAGEL

We use BAGEL to calculate Bayes Factors for each screen to determine essential genes. We run the tool separately for each sample type and each library. We also run the tool for each sample type based on combined libraries (such that we get 8 sgRNAs per sample). As input files we must first generate tables of log2-fold changes that we can feed into BAGEL.

## file for library A
fold_changes %>% filter(library == 'A') %>% 
  unite(sample, type, rep) %>% 
  dplyr::select(sgRNA, symbol, sample, log2fc) %>% 
  spread(sample, log2fc) %>% filter(!is.na(symbol)) %>%
  write_tsv('HDCRISPR_A_fold_changes.txt')

## file for library B
fold_changes %>% filter(library == 'B') %>% 
  unite(sample, type, rep) %>% dplyr::select(sgRNA, symbol, sample, log2fc) %>% 
  spread(sample, log2fc) %>% filter(!is.na(symbol)) %>%
  write_tsv('HDCRISPR_B_fold_changes.txt')

## one for combined libraries
fold_changes %>% filter(!grepl('CONTROL', sgRNA), !is.na(symbol)) %>% 
  unite(sample, type, rep) %>% 
  dplyr::select(sgRNA, symbol, sample, log2fc) %>% 
  spread(sample, log2fc) %>%
  write_tsv('HDCRISPR_combined_fold_changes.txt')

Now we can now execute BAGEL outisde of R to calculate Bayes Factors for each sample from the fold changes that we wrote. We use the CEG2 and NEG gene sets as training sets. After running BAGEL we can read the results back into R.

## load BAGEL bayes factors
data('bfs', package = 'HDCRISPR2019')

Precision-recall analysis

We generate PR curves for each screen.

## calculate curve data
pr_curves <- bfs %>%
  mutate(gene_type = ifelse(GENE %in% ce$symbol, 'core-essential',
                     ifelse(GENE %in% ne$symbol, 'non-essential', 'other'))) %>%
  filter(gene_type != 'other') %>%
  group_by(type, library) %>%
  group_modify(~{
    pred <- prediction(.x$BF, 
                       labels = ifelse(.x$gene_type == 'core-essential', 1, 0))
    perf <- performance(pred, measure = 'prec', x.measure = 'rec')
    auc <- performance(pred, measure = 'auc')@y.values[[1]]
    fdr5 <- perf@alpha.values[[1]][max(which(perf@y.values[[1]] > 0.97))]
    tibble(
      prec = perf@y.values[[1]],
      rec = perf@x.values[[1]],
      co = fdr5,
      auc = auc
    )
  }) %>% ungroup()

## draw curves
pr_curves %>% filter(library != 'combined') %>%
  ggplot(aes(rec, prec)) + geom_line(aes(colour = type)) +
  facet_grid(library~type) +
  geom_abline(linetype = 'dashed') + 
  xlab('recall') + ylab('precision') + 
  scale_colour_manual(values = c('#67001F', '#053061', '#4393C3'))

plot of chunk unnamed-chunk-48

We compare AUC values to screens in previously published libraries.

## AUCs for screens with the HD CRISPR lib.
hdcrispr_auc <- distinct(pr_curves, type, library, auc)

## AUCs in other screens selected for design.
genomecrispr_auc <- roc_results %>% bind_rows() %>%
  distinct(pubmed, cellline, condition, auc) %>%
  filter(auc > 0.9)

## compare aucs between both groups
hdcrispr_auc %>% mutate(lib = 'HDCRISPR') %>%
  bind_rows(genomecrispr_auc %>%
              mutate(lib = 'others')) %>% 
  ggplot(aes(auc, colour = lib)) + 
  geom_density() + 
  scale_y_continuous(expand = c(0,0)) + 
  scale_colour_manual(values = c('#4393C3', '#444444')) + 
  theme(legend.position = 'bottom')

plot of chunk unnamed-chunk-49

We determine whether a gene is considered 'essential' or not based on a BF > 6 threshold.

bfs <- bfs %>% mutate(essential = ifelse(BF > 6, T, F)) 

Comparison with previous Hap1 screens

Genome-scale Hap1 gene dropout screens have been conducted previously with the TKOv3 library (Hart et al., 2017) and with the GeneTrap system. We can compare our results to those obtained in these screens. For the TKOv3 screen, Bayes Factors are available so that's what we will look at as gene-level pheontype scores.

## previous screens in hap1 cells (gene trap and tkov1)
data('previous_hap1_screens', package = 'HDCRISPR2019')
data('tko3_bf_hap1', package = 'HDCRISPR2019')

We create a precision recall curve for the TKOv3 screen to check what the area under the curve is.

pr_with_tko3 <- bfs %>% filter(type == 'Pool') %>% 
  bind_rows(tko3_bf_hap1 %>% rename(BF = BF_tko3) %>% mutate(library = 'TKOv3')) %>%
  bind_rows(tko1_bf_hap1 %>% rename(BF = BF_HAP1) %>% filter(!is.na(BF)) %>% mutate(library = 'TKOv1')) %>%
  mutate(gene_type = ifelse(GENE %in% ce$symbol, 'core-essential',
                     ifelse(GENE %in% ne$symbol, 'non-essential', 'other'))) %>%
  filter(gene_type != 'other') %>%
  group_by(library) %>%
  group_modify(~{
    pred <- prediction(.x$BF, labels = ifelse(.x$gene_type == 'core-essential', 1, 0))
    perf <- performance(pred, measure = 'prec', x.measure = 'rec')
    auc <- performance(pred, measure = 'auc')@y.values[[1]]
    fdr5 <- perf@alpha.values[[1]][max(which(perf@y.values[[1]] > 0.97))]
    tibble(
      prec = perf@y.values[[1]],
      rec = perf@x.values[[1]],
      co = fdr5,
      auc = auc
    )
  }) %>% ungroup()

## draw curves
pr_with_tko3 %>% filter(library != 'combined') %>%
  ggplot(aes(rec, prec)) + geom_line(aes(colour = library)) +
  geom_abline(linetype = 'dashed') + 
  xlab('recall') + ylab('precision') + 
  scale_colour_manual(values = c('#67001F', '#053061', '#f4b400', '#2e8b57')) + 
  xlim(c(0.9, 1)) + ylim(c(0.9, 1))
#> Warning: Removed 5166 row(s) containing missing values (geom_path).

plot of chunk unnamed-chunk-52

We create scatter plots to see whether Bayes Factors for our screens correlate with the previously published HAP1 screens.

## datasets
comp_tko <- bfs %>% filter(type == 'Pool', library %in% c('A', 'B')) %>% 
  dplyr::select(GENE, BF, library) %>% 
  inner_join(tko1_bf_hap1)

comp_tko3 <- bfs %>% filter(type == 'Pool', library %in% c('A', 'B')) %>%
  dplyr::select(GENE, BF, library) %>% 
  inner_join(tko3_bf_hap1)

comp_genetrap <- bfs %>% filter(type == 'Pool', library %in% c('A', 'B')) %>%
  dplyr::select(GENE, BF, library) %>% 
  inner_join(genetrap_ess %>% dplyr::select(-q.val))

## scatter plots
comp_tko %>% ggplot(aes(BF, BF_HAP1)) + 
  geom_hex(bins=100, fill = '#d3d3d3') + 
  geom_point(data = subset(comp_tko, GENE %in% ce$symbol),
             aes(BF, BF_HAP1), colour = '#B2182B', size=0.5) +
  geom_point(data = subset(comp_tko, GENE %in% ne$symbol),
             aes(BF, BF_HAP1), colour = '#2166AC', size=0.5) +
  facet_wrap(~ library, scales = 'free_x') + geom_abline(linetype = 'dashed') + 
  geom_hline(yintercept = 0, linetype = 'dashed') + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  xlab('BF Hap1 HD CRISPR bulk') + ylab('BF Hap1 TKOv1')

plot of chunk unnamed-chunk-53


comp_tko3 %>% ggplot(aes(BF, BF_tko3)) + 
  geom_hex(bins=100, fill = '#d3d3d3') + 
  geom_point(data = subset(comp_tko3, GENE %in% ce$symbol),
             aes(BF, BF_tko3), colour = '#B2182B', size=0.5) +
  geom_point(data = subset(comp_tko3, GENE %in% ne$symbol),
             aes(BF, BF_tko3), colour = '#2166AC', size=0.5) +
  facet_wrap(~ library, scales = 'free_x') + geom_abline(linetype = 'dashed') + 
  geom_hline(yintercept = 0, linetype = 'dashed') + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  xlab('BF Hap1 HD CRISPR bulk') + ylab('BF Hap1 TKOv3')

plot of chunk unnamed-chunk-53


## correlation coefficients
comp_tko %>% drop_na() %>%
  group_by(library) %>%
  summarise(PCC = cor(BF, BF_HAP1), 
            SCC = cor(BF, BF_HAP1, method='spearman')) %>% 
  ungroup()

comp_tko3 %>% drop_na() %>%
  group_by(library) %>%
  summarise(PCC = cor(BF, BF_tko3), 
            SCC = cor(BF, BF_tko3, method='spearman')) %>% 
  ungroup()

## same for bloomen
comp_genetrap %>% ggplot(aes(BF, ratio)) + 
  geom_hex(bins=100, fill = '#d3d3d3') + 
  geom_point(data = subset(comp_genetrap, GENE %in% ce$symbol),
             aes(BF, ratio), colour = '#B2182B', size=0.5) +
  geom_point(data = subset(comp_genetrap, GENE %in% ne$symbol),
             aes(BF, ratio), colour = '#2166AC', size=0.5) +
  facet_wrap(~ library, scales='free_x') +
  geom_hline(yintercept = 0.5, linetype = 'dashed') + 
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  xlab('BF Hap1 HD CRISPR bulk') + ylab('Ratio sense vs. antisense gene trap')

plot of chunk unnamed-chunk-53


comp_genetrap %>% drop_na() %>%
  group_by(library) %>%
  summarise(PCC = cor(BF, ratio), 
            SCC = cor(BF, ratio, method='spearman')) %>% 
  ungroup()

We can also make a Venn diagram. We make two lists, representing libraries A and B, respectively. Accordingly, we make two Venn diagrams comparing screens in these libraries with existing experiments.

## HAP1 TKOv1 cutoff selected as described in Hart et al, 2017
venn_a <- list(
  gt = genetrap_ess %>% filter(q.val < 0.05) %>% pull(GENE),
  tkov1 = tko1_bf_hap1 %>% filter(BF_HAP1 > 6) %>% pull(GENE),
  tkov3 = tko3_bf_hap1 %>% filter(BF_tko3 > 6) %>% pull(GENE),
  hd_pool_a = bfs %>% filter(library == 'A', essential, type == 'Pool') %>% pull(GENE)
)

## HAP1 TKOv1 cutoff selected as described in Hart et al, 2017
venn_b <- list(
  gt = genetrap_ess %>% filter(q.val < 0.05) %>% pull(GENE),
  tkov1 = tko1_bf_hap1 %>% filter(BF_HAP1 > 6) %>% pull(GENE),
  tkov3 = tko3_bf_hap1 %>% filter(BF_tko3 > 6) %>% pull(GENE),
  hd_pool_b = bfs %>% filter(library == 'B', essential, type == 'Pool') %>% pull(GENE)
)

## plot venn diagrams
gplots::venn(venn_a)

plot of chunk unnamed-chunk-54

gplots::venn(venn_b)

plot of chunk unnamed-chunk-54

We can also compare Hap1 cells to other cell lines. Specifically we compare Hap1 cells to KBM7 cells, previously published screens in human embryonic stem cells (Yilmaz et al., 2018 in Nat. Cell. Biol.) and then maybe some random cancer cell lines.

## read gene-level crispr phenotyeps in hesc cells
data('hesc_crispr', package = 'HDCRISPR2019')

## venn diagram comparing different lines
venn_comp <- list(
  ## kbm7 essential genes
  KBM7 = bagel_results %>% bind_rows() %>% 
    filter(cellline == 'KBM7', 
           pubmed != '24336569', 
           condition == 'viability') %>% 
    filter(BF > 6) %>% pull(GENE),
  ## HCT116
  HT29 = bagel_results %>% bind_rows() %>% 
    filter(cellline == 'HT29', 
           condition == 'viabilityafter25days') %>% 
    filter(BF > 6) %>% pull(GENE),
  ## HAP1
  HAP1 = bfs %>% 
    filter(library == 'combined', type == 'Pool') %>% 
    filter(BF > 6)  %>% pull(GENE),
  ## hESC
  hESC = hesc_crispr %>% filter(cscore < 0, FDR < 0.05) %>% pull(symbol)
)

## draw venn
gplots::venn(venn_comp)

plot of chunk unnamed-chunk-55

We make a box plot visualizing the phenotypic space (max fold change - min. fold change of all guides targeting a gene) comparing HD CRISPR and TKO libraries in Hap1 cells.

## tkov3 differences
data('tkov3_fc', package = 'HDCRISPR2019')

## hd crispr
hdcrispr_bulk_diff <- fold_changes %>%
  filter(type == 'Pool', library %in% c('A', 'B')) %>% 
  dplyr::select(sgRNA, library, rep, log2fc, symbol) %>%
  spread(rep, log2fc) %>% 
  mutate(avg_fc = rowMeans(cbind(R1, R2)),
         avg_fc = avg_fc / sd(avg_fc)) %>% 
  group_by(library, symbol) %>% 
  summarise(diff = max(avg_fc) - min(avg_fc)) %>% 
  ungroup() %>%
  mutate(library = paste('HD CRISPR', library))
#> `summarise()` regrouping output by 'library' (override with `.groups` argument)

hdcrispr_all_diff <- fold_changes %>%
  filter(library %in% c('A', 'B')) %>% 
  dplyr::select(sgRNA, library, type, rep, log2fc, symbol) %>%
  spread(rep, log2fc) %>% 
  mutate(avg_fc = rowMeans(cbind(R1, R2)),
         avg_fc = avg_fc / sd(avg_fc)) %>% 
  group_by(library, type, symbol) %>% 
  summarise(diff = max(avg_fc) - min(avg_fc)) %>% 
  ungroup() %>%
  mutate(library = paste('HD CRISPR', library))
#> `summarise()` regrouping output by 'library', 'type' (override with `.groups` argument)

## tkov1 differences
data('tkov1_fc', package = 'HDCRISPR2019')

## visualize as boxplot
overlapping_symbols <- intersect(intersect(tkov3_fc$symbol, tkov1_fc$symbol),
                                 hdcrispr_bulk_diff$symbol)
tkov1_fc %>% bind_rows(tkov3_fc) %>% 
  bind_rows(hdcrispr_bulk_diff) %>%
  filter(symbol %in% overlapping_symbols) %>% 
  group_by(library) %>% mutate(median = median(diff)) %>% ungroup() %>%
  arrange(median) %>% mutate(library = factor(library, levels = unique(library))) %>%
  ggplot(aes(library, diff)) + geom_boxplot() +
  ylab('max(log fold change) - min(log fold change)') + 
  coord_flip()

plot of chunk unnamed-chunk-56


## visualize differences between bulk and single cell
hdcrispr_all_diff %>%
  unite(screen, library, type) %>%
  group_by(screen) %>% mutate(median = median(diff)) %>% ungroup() %>%
  arrange(median) %>% mutate(screen = factor(screen, levels = unique(screen))) %>%
  ggplot(aes(screen, diff)) + geom_boxplot() +
  ylab('max(log fold change) - min(log fold change)') + 
  coord_flip()

plot of chunk unnamed-chunk-56

Context-dependent essential genes

We compare every hit in our screen with the fraction of previous screens that the gene showed a phenotype in. To this end we load pre-computed values indicating for each gene the fraction of cell lines that it is essential in.

## add promiscuity score
data('pscores', package = 'HDCRISPR2019')

## highlight yamanaka
yamanaka <- c('NANOG', 'MYC', 'POU5F1', 'KLF4', 'SOX2')
off_targets <- c('ARGFX', 'CNIH4', 'ZNF676', 'SUN3', 'TCEANC')

plot_df <- bfs %>% left_join(pscores) %>% 
  filter(library == 'combined', type == 'Pool')

ggplot() + 
  geom_point(data = plot_df, aes(BF, pscore), 
             colour = '#dddddd') + 
  geom_point(data = filter(plot_df, GENE %in% yamanaka),
             aes(BF, pscore), colour = '#2166AC') +
  geom_point(data = filter(plot_df, GENE %in% off_targets), 
             aes(BF, pscore), colour = '#B2182B') +
  geom_text_repel(data = filter(plot_df, 
                                GENE %in% c(yamanaka, off_targets)), 
                  aes(BF, pscore, label = GENE)) +
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  geom_hline(yintercept = 0.5, linetype = 'dashed') +
  xlab('Bayes Factor (combined libraries)') +
  ylab('fraction of screens where gene is essential')

plot of chunk unnamed-chunk-57

The genes that are essential in most previous screens in cancer cell lines but are not found with the HD CRISPR library are mostly unknown / uncharacterized genes. These might be off-target effects in the Avana library that has been used for most screens. We make a plot comparing the Bayes Factors for these genes across different libraries that were used to design the HD CRISPR library.

## annotate library
lib_anno <- tibble(
  pubmed = c('26472758', '26627737', '24336569', '27260157', '26780180', 
             '27760321', '27260156', '27869803', '28145866', '28162770', 
             '29083409', '28700943'),
  library= c('Sabatini2015', 'TKOv1', 'Sabatini2014', 'Novartis', NA, 
             'Yusa2016', 'GeCKOv2', 'TKOv1', 'GeCKOv2', 'Sabatini2017', 
             'Avana', 'Sabatini2015')
)

## normalize bayes factors
bf_mat <- bind_rows(bagel_results) %>% inner_join(lib_anno) %>% 
  unite(screen, cellline, condition, library) %>% 
  dplyr::select(GENE, BF, screen) %>%
  bind_rows(bfs %>% filter(library == 'combined') %>% 
              mutate(screen = paste('HAP1', type, library, sep='_')) %>% 
              dplyr::select(GENE, BF, screen)) %>%
  reshape2::acast(GENE ~ screen, value.var = 'BF')

## quantile normalization
bf_mat_norm <- normalize.quantiles(bf_mat)
colnames(bf_mat_norm) <- colnames(bf_mat)
rownames(bf_mat_norm) <- rownames(bf_mat)

## draw box plot to see whether phenotypes are library specific
as_tibble(bf_mat_norm, rownames='GENE') %>% 
  filter(GENE %in% off_targets) %>% gather(screen, BF, -GENE) %>% 
  filter(!grepl('KRAS', screen)) %>% 
  separate(screen, c('cellline', 'condition', 'library')) %>% 
  filter(library != 'NA') %>%
  ggplot(aes(library, BF)) + geom_boxplot() + 
  facet_wrap(~GENE) + 
  geom_hline(yintercept = 0, linetype = 'dashed') + 
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab('normalized Bayes Factor')

plot of chunk unnamed-chunk-58

HAP1 specific essential genes

In the above scatter plot we saw context-specific Hap1 essential genes in the bottom right corner, and broadly essential genes that do not lead to a phenotype in HAP1 cells in the top left corner. We can pick a cutoff to select genes in the bottom right corner (HAP1 specific essential genes) and perform a gene set enrichment analysis to check whether there are overrepresented pathways.

## list of hap1 essentials ordered by specificity
hap1_specific <- plot_df %>% filter(BF > 6, pscore < 0.4)

## convert gene symbol to entrez id
data('symbol_to_entrez', package = 'HDCRISPR2019')

## add entrez to hap1 essentials
hap1_specific <- hap1_specific %>% 
  inner_join(symbol_to_entrez %>% dplyr::select(GENE=symbol, entrez)) %>% 
  arrange(BF)

## perform gene set overrepresentation analysis on hap1-specific essentials
goa_out <- limma::kegga(hap1_specific$entrez, 
                 universe = symbol_to_entrez %>% 
                   filter(symbol %in% plot_df$GENE) %>% 
                   pull(entrez) %>% unique(), 
                 species = 'Hs')
top_go <- limma::topKEGG(goa_out)

This reveals a strong enrichment for regulators of the Fanconi Anemia pathway. We make a heatmap that shows certain genes of interest and their essentiality across a number of cell lines.

## hap1-specific essential genes
hap1_specific <- bfs %>% 
  filter(library == 'combined', type == 'Pool') %>% 
  dplyr::select(GENE, BF) %>% mutate(cellline = 'HAP1')

## essential genes for a number of other cell lines
other_cl <- bagel_results %>% bind_rows() %>% 
  filter(cellline %in% c('KBM7', 'K562', 'WM115', 'AGS', 'MDAMB453')) %>%
  filter(pubmed %in% c('26472758', '29083409')) %>%
  dplyr::select(GENE, BF, cellline)

## gene sets
yamanaka <- c('MYC', 'POU5F1', 'KLF4', 'SOX2', 'NANOG')
oncogenes <- c('KRAS', 'BRAF', 'PIK3CA', 'ERBB2', 'BCR', 'CTNNB1')
core_ess <- c('POLR2A', 'RPL7', 'SF1')
fanc <- c('FANCA', 'FANCB', 'FANCC', 'FANCE', 'FANCF', 'FANCG',
          'FANCL', 'FANCM', 'FAAP20', 'FAAP100', 'FAAP24', 
          'UBE2T', 'FANCD2', 'FANCI')

## draw essentiality heat map
hap1_specific %>% bind_rows(other_cl) %>% 
  mutate(ess = ifelse(BF > 6, 1, 0)) %>% 
  bind_rows(hesc_crispr %>%
              mutate(ess = ifelse(cscore < 0 & FDR < 0.05, 1, 0),
                     cellline = 'hESC') %>% 
              dplyr::select(GENE=symbol, ess, cellline)) %>%
  filter(GENE %in% c(yamanaka, oncogenes, core_ess, fanc)) %>% 
  reshape2::acast(cellline ~ GENE, value.var = 'ess') %>% 
  pheatmap::pheatmap(color = c('#dddddd', '#db4437'),
                     border_color = '#ffffff')

plot of chunk unnamed-chunk-60

Do we see the Yamanaka and Fanconi Anemia dropouts in other Hap1 screens, too?

genetrap_ess %>% 
  mutate(ess = ifelse(q.val < 0.05, 1, 0), 
         screen = 'Gene trap') %>% 
  dplyr::select(symbol = GENE, ess, screen) %>%
  bind_rows(
    tko1_bf_hap1 %>% 
      mutate(ess = ifelse(BF_HAP1 > 6, 1, 0), 
             screen = 'TKOv1') %>% 
      dplyr::select(symbol = GENE, ess, screen)
  ) %>%
  bind_rows(
    bfs %>% filter(library != 'combined') %>%
      unite(screen, library, type) %>% 
      dplyr::select(ess = essential, screen, symbol = GENE) %>%
      mutate(ess = ifelse(ess, 1, 0))
  ) %>%
  filter(symbol %in% c(yamanaka, fanc)) %>% 
  acast(screen ~ symbol, value.var = 'ess') %>%
  pheatmap(color = c('#dddddd', '#db4437'), 
           border_color = '#ffffff',
           na_col = '#ffffff')

plot of chunk unnamed-chunk-61

Hit calling with individual sub-libraries compared to the combined library

BAGEL

We want to observe whether we achieve increased performance, when we combine both libraries for hit calling. We first have a look at the BAGEL results.

## use BF > 6 as cutoff like in Hart et al.
bfs %>% mutate(essential = ifelse(BF > 6, T, F)) %>% 
  dplyr::count(library, type, essential) %>% filter(essential) %>% 
  ggplot(aes(library, n, fill = type)) + 
  geom_bar(stat='identity', position='dodge') + 
  ylab('number of essential genes') + 
  ylim(c(0,2500)) +
  scale_fill_manual(values = c('#67001F', '#053061', '#4393C3')) + 
  theme(legend.position = 'bottom')

plot of chunk unnamed-chunk-62


## plot bf distributions for each screen
bfs %>% ggplot(aes(BF, colour = type)) + 
  geom_density() + facet_wrap(~library) +
  scale_y_continuous(expand = c(0,0)) +
  scale_color_manual(values = c('#67001F', '#053061', '#4393C3')) 

plot of chunk unnamed-chunk-62

Mageck RRA

Another commonly used pipeline that is frequently used to analyze CRISPR screens and does not require prior information in the form of core- and non-essential genes is Mageck RRA. We run this algorithm on both individual and combined libraries to see how many hits can be identified.

We first have to run MAGeCK on libraries A, B and the combined set. We do this outside of R using the MAGeCK command line tool with default parameters. We can then load the results back into R for visualization. We create similar bar graphs as we did for the BAGEL results above, using a 5% FDR cutoff to determine essential genes.

## load combined mageck results
data('mageck_res', package = 'HDCRISPR2019')

## similar bar graphs as we made for the BAGEL analysis
mageck_res %>% mutate(essential = ifelse(FDR < 0.05 & lfc < 0, T, F)) %>% 
  filter(essential) %>% dplyr::count(lib, sample) %>%
  ggplot(aes(lib, n, fill=sample)) + 
  geom_bar(stat='identity', position = 'dodge') + 
  ylim(c(0, 2500)) +
  theme(legend.position = 'bottom') + 
  scale_fill_manual(values = c('#67001F', '#053061', '#4393C3')) +
  ylab('number of essential genes')

plot of chunk unnamed-chunk-63

gscreend (Imkeller et al.)

gscreend can model the assymetry in CRISPR dropout screens, which might detect more essential genes than MAGeCK at low library coverage. We run gscreend on all HD CRISPR library screens. The code below takes some time to run. Therefore we also include precomputed results with this R package.

## add experiment information to counts df
counts_df_gsc <- norm_df %>% filter(!grepl('CONTROL', sgRNA)) %>%
  mutate(sample = gsub('Pool', 'bulk', sample)) %>%
  separate(sample, c('cellline', 'time', 'rep'), 
           sep='_', fill = 'right', remove=F)

## add 'combined' library
counts_df_gsc <- counts_df_gsc %>% 
  dplyr::select(sgRNA, Gene, sample, library, cellline, time, rep, count) %>% 
  bind_rows(counts_df_gsc %>% 
              mutate(library = 'combined', 
                     norm_count = round(norm_count, digits = 0)) %>% 
              dplyr::select(sgRNA, Gene, sample, library, 
                            cellline, time, rep, count = norm_count))
## run gscreened on each library
gscreend_out <- counts_df_gsc %>% distinct(cellline, library) %>% 
  filter(cellline != 'Plasmid') %>% 
  mutate(gscreend = map2(cellline, library, ~{
    ## extract data needed for analysis
    hdcrispr_df <- counts_df_gsc %>% 
      filter(cellline %in% c('Plasmid', .x), library == .y) %>% 
      dplyr::select(-c(time, rep, cellline)) %>%
      spread(sample, count) %>%
      dplyr::select(sgRNA, Gene, library, Plasmid, everything())

    ## we first need a count matrix, three columns, plasmid and Tx
    counts_matrix <- as.matrix(cbind(hdcrispr_df[,4:6]))

    ## we then make an annotation object for rows
    rowData <- data.frame(sgRNA_id = hdcrispr_df$sgRNA,
                          gene = hdcrispr_df$Gene)

    ## same for the columns
    colData <- data.frame(samplename = c("library", "R1", "R2"),
                          # timepoint naming convention: 
                          # T0 -> reference, 
                          # T1 -> after proliferation
                          timepoint = c("T0", "T1", "T1"))

    ## use these to make a summarized experiment object              
    se <- SummarizedExperiment(assays=list(counts=counts_matrix),
                               rowData=rowData, colData=colData)

    ## use this object to run gscreend
    pse <- createPoolScreenExp(se)
    pse_an <- RunGscreend(pse)

    ## return top table
    as_tibble(ResultsTable(pse_an)) %>%
      mutate(library = .y,
             sample = .x)
  }))

We visualize the results as a bar plot.

## visualize results as bar plot
gscreend_out %>% dplyr::select(-library) %>% 
  unnest(gscreend) %>% 
  mutate(essential = ifelse(fdr < 0.05 & lfc < 0, T, F)) %>% 
  filter(essential) %>% dplyr::count(sample, library) %>% 
  ggplot(aes(library, n, fill=sample)) + 
  geom_bar(stat='identity', position = 'dodge') + 
  scale_fill_manual(values = c('#67001F', '#053061', '#4393C3')) + 
  ylim(c(0, 2500)) + ylab('number of essential genes') + 
  theme(legend.position = 'bottom')

plot of chunk unnamed-chunk-66

Comparison of hits determined by MAGeCK and BAGEL

To see whether BAGEL and MAGeCK agree in terms of the essential genes that they determine, we generate Venn diagrams

## hits called by mageck
mageck_hits <- mageck_res %>% 
  mutate(essential = ifelse(FDR < 0.05 & lfc < 0, T, F)) %>%
  filter(essential) %>%
  dplyr::select(library = lib, sample, symbol) %>%
  distinct()

## hits called by bagel
bagel_hits <- bfs %>% 
  mutate(essential = ifelse(BF > 6, T, F),
         type = ifelse(type == 'Pool', 'Bulk', type)) %>%
  filter(essential) %>%
  dplyr::select(library, sample = type, symbol = GENE) %>%
  distinct()

## overlap between clones with mageck
mageck_venn <- mageck_hits %>% split(.$library) %>% 
  map(~ .x %>% split(.$sample) %>% 
        map(function(x) x$symbol))

walk(names(mageck_venn), 
     ~ plot(euler(mageck_venn[[.x]]), 
            quantities = T, 
            main = paste('MAGeCK library', .x)))

## overlap between clones with bagel
bagel_venn <- bagel_hits %>% split(.$library) %>% 
  map(~ .x %>% split(.$sample) %>% 
        map(function(x) x$symbol))

walk(names(bagel_venn), 
     ~ plot(euler(mageck_venn[[.x]]), 
            quantities = T, 
            main = paste('BAGEL library', .x)))

We further make pairwise comparisons between the tools for each library and cell type individually.

## compare n hits bagel vs mageck for different condition
venn_bagel_vs_mageck <- bagel_hits %>%
  mutate(tool = 'BAGEL') %>% 
  bind_rows(mageck_hits %>% mutate(tool = 'MAGeCK')) %>%
  unite(screen, library, sample) %>%
  split(.$screen) %>%
  map(~ .x %>% split(.$tool) %>% map(function(x) x$symbol))

venn_diagrams <- map(names(venn_bagel_vs_mageck), 
     ~ plot(euler(venn_bagel_vs_mageck[[.x]]), 
            quantities = T, 
            main = .x))

## print plots to canvase
n <- length(venn_diagrams)
nCol <- floor(sqrt(n))
do.call("grid.arrange", c(venn_diagrams, ncol=nCol))

plot of chunk unnamed-chunk-68

Cutting and non-cutting sgRNAs

We notice that there is a small but distinct phenotypic difference between the targeting and non-targeting sgRNA controls in the HD CRISPR library. We explore how well these can actually be separated from each other. This might be useful to inform whether a gene targeting a non-essential gene might cut the DNA or not.

## a data frame containing control phenotypes
ctrl_df <- fold_changes %>% filter(grepl('_C_', sgRNA)) %>% 
  mutate(control_type = ifelse(Gene %in% nontarg, 'non-targeting', 'targeting')) %>% 
  bind_rows(fold_changes %>% filter(symbol %in% ne$symbol) %>% 
              mutate(control_type = 'non-essential'))

## plot control fold changes for all ctrl sgrnas
ctrl_df %>% ggplot(aes(log2fc)) + 
  geom_density(aes(colour = control_type)) + 
  facet_grid(library ~ type) + 
  scale_y_continuous(expand = c(0,0)) + 
  panel_border() + 
  xlim(c(-5, 5)) + 
  xlab('fold change [log2]')

plot of chunk unnamed-chunk-69


## plot specifically for SCC12 library A
ctrl_df %>% filter(library == 'A', type == 'SCC12', 
                   control_type != 'non-essential') %>%
  ggplot(aes(log2fc)) + 
  geom_density(aes(colour = control_type)) +
  scale_y_continuous(expand = c(0,0)) + 
  xlim(c(-5, 5))

plot of chunk unnamed-chunk-69

## set for indeterministic mixture model
set.seed(1234)

df_cutting_noncutting <- purrr::map_df(c('A', 'B'), function(lib){
  ## data for fitting
  logfcs <- ctrl_df %>% 
    filter(library == lib, 
           type == 'SCC12', 
           control_type != 'non-essential')

  ## starting parameters for mixture model
  start_params <- logfcs %>% group_by(control_type) %>% 
    summarise(med = median(log2fc), mad = mad(log2fc))

  ## fit mixture model, 3 components
  mm <- normalmixEM(logfcs$log2fc, k = 4,
                    lambda = c(1, 1, 0.25, 0.25),
                    mu = c(start_params$med[1], start_params$med[2], -2, -4), 
                    sigma = c(start_params$mad[1], start_params$mad[2], 3, 3))

  ## plot fit
  plot(mm, which = 2, breaks = 50)

  ## extract parameters
  params_nt <- list(mu = mm$mu[1], sigma = mm$sigma[1])
  params_targ <- list(mu = mm$mu[2], sigma = mm$sigma[2])

  ## compare fit to targeting/non-targeting controls
  logfcs %>% 
    mutate(log2fc_sim_nt = rnorm(n(), mean = params_nt$mu, sd = params_nt$sigma), 
           log2fc_sim_targ = rnorm(n(), mean = params_targ$mu, sd = params_targ$sigma)) %>% 
    ggplot() + 
    geom_density(aes(log2fc, color = control_type)) + 
    geom_density(aes(log2fc_sim_nt), color = '#cccccc', linetype='dashed') + 
    geom_density(aes(log2fc_sim_targ), color = '#f4b400', linetype='dashed') + 
    geom_vline(xintercept = start_params$med, linetype='dashed') + 
    scale_y_continuous(expand = c(0, 0))

  ## extract posterior probabilities from model
  post_probs <- as_tibble(cbind(mm$x, mm$posterior), .name_repair = 'minimal') %>% 
    `colnames<-`(c('closest_log2fc', 'p_nontarg', 'p_targ', 'p_pheno', 'p_strong_pheno'))

  ## annotate closest value in posterior probability table
  closest_anno <- fold_changes %>% filter(library == lib, type == 'SCC12') %>% 
    group_by(library, symbol, sgRNA) %>% 
    summarise(log2fc = mean(log2fc)) %>% ungroup() %>%
    rowwise() %>% 
    mutate(closest = which.min(abs(post_probs$closest_log2fc - log2fc))) %>% 
    ungroup()

  closest_anno %>% 
    bind_cols(post_probs %>% dplyr::slice(closest_anno$closest)) %>%
    mutate(p_pheno = p_pheno + p_strong_pheno) %>% 
    dplyr::select(-p_strong_pheno)
})
#> `summarise()` ungrouping output (override with `.groups` argument)
#> number of iterations= 359
#> `summarise()` regrouping output by 'library', 'symbol' (override with `.groups` argument)
#> `summarise()` ungrouping output (override with `.groups` argument)

plot of chunk unnamed-chunk-70

#> number of iterations= 681
#> `summarise()` regrouping output by 'library', 'symbol' (override with `.groups` argument)

plot of chunk unnamed-chunk-70

Do the closest control fold changes correspond well to the closest annotated fold changes?

## scatter plot comparing matched values
df_cutting_noncutting %>% 
  ggplot(aes(log2fc, closest_log2fc)) + 
  geom_hex(bins = 100) + 
  facet_wrap(~ library)

plot of chunk unnamed-chunk-71

Now we can assign to each sgRNA whether it's likely non-cutting, cutting or cutting with a phenotype, or unclear.

df_cutting_noncutting <- df_cutting_noncutting %>% 
  mutate(pheno_group = ifelse(p_targ > 0.8, 'likely cutting', 
                       ifelse(p_nontarg > 0.8, 'likely not cutting', 
                       ifelse(p_pheno > 0.8 & log2fc < 0, 'growth phenotype', 
                              'unclear'))))

## phenotype ranges
df_cutting_noncutting %>% 
  group_by(library, pheno_group) %>% 
  summarise(min_fc = min(log2fc), max_fc = max(log2fc)) %>% 
  ungroup()
#> `summarise()` regrouping output by 'library' (override with `.groups` argument)
#> # A tibble: 8 x 4
#>   library pheno_group         min_fc  max_fc
#>   <chr>   <chr>                <dbl>   <dbl>
#> 1 A       growth phenotype   -12.2   -1.81  
#> 2 A       likely cutting      -1.07   0.0517
#> 3 A       likely not cutting   0.977  1.70  
#> 4 A       unclear             -1.81   3.07  
#> 5 B       growth phenotype   -10.5   -2.06  
#> 6 B       likely cutting      -1.07  -0.459 
#> 7 B       likely not cutting   0.565  1.86  
#> 8 B       unclear             -2.06   3.65

Now we can summarize for the whole screen how many sgRNAs are cutting, non-cutting or have a phenotype.

## results from bagel analysis
mageck_essential <- mageck_res %>% 
  filter(sample == 'SCC12', lib == 'combined') %>% 
  filter(lfc < 0, FDR < 0.05, !is.na(symbol)) %>% 
  pull(symbol)

## annotate sgrnas for plotting
for_plotting <- df_cutting_noncutting %>% 
  filter(!grepl('CONTROL', sgRNA)) %>% 
  mutate(reference_group = ifelse(symbol %in% ce$symbol, 'core-essential', 
                           ifelse(symbol %in% ne$symbol, 'nonessential', 
                                  'other')), 
         gene_group = ifelse(symbol %in% mageck_essential, 
                             'HAP1 essential', 'HAP1 non-essential')) %>%
  inner_join(libA_seqs %>% mutate(sgRNA = gsub('>', '', guide_id)) %>% 
               inner_join(n_a %>% dplyr::select(nopam, valid)) %>% 
               dplyr::select(sgRNA, valid) %>%
               mutate(library = 'A') %>%
               bind_rows(
                 libB_seqs %>% mutate(sgRNA = gsub('>', '', guide_id)) %>% 
                   inner_join(n_b %>% dplyr::select(nopam, valid)) %>% 
                   dplyr::select(sgRNA, valid) %>%
                   mutate(library = 'B')
               ))
#> Joining, by = "nopam"
#> Joining, by = "nopam"
#> Joining, by = c("library", "sgRNA")

## reference gene sets
for_plotting %>% filter(!grepl('CONTROL', sgRNA)) %>%
  filter(symbol %in% c(ce$symbol, ne$symbol)) %>%
  count(library, gene_group, pheno_group) %>% 
  group_by(library, gene_group) %>% 
  mutate(perc = n/sum(n) * 100) %>% 
  ungroup() %>% 
  ggplot(aes(gene_group, perc, fill = pheno_group)) + 
  geom_bar(stat = 'identity', position = 'dodge') + 
  facet_wrap(~library, ncol = 1) +
  ggtitle('HAP1 SCC12, library A, reference gene sets')

plot of chunk unnamed-chunk-73


## mageck hits, without reference genes
for_plotting %>% 
  filter(!grepl('CONTROL', sgRNA),
         !symbol %in% c(ce$symbol, ne$symbol)) %>%
  mutate(gene_group = ifelse(symbol %in% mageck_essential, 
                             'HAP1 essential', 'HAP1 non-essential')) %>%
  count(library, gene_group, pheno_group) %>% 
  ggplot(aes(gene_group, n, fill = pheno_group)) + 
  geom_bar(stat = 'identity', position = 'dodge') + 
  facet_wrap(~library, ncol = 1) +
  ggtitle('HAP1 SCC12, library A, excluding reference gene sets')

plot of chunk unnamed-chunk-73

Finally, we also compare differences between empirically designed and not empirically designed sgRNAs.

## plot with reference genes
for_plotting %>% count(library, gene_group, valid, pheno_group) %>% 
  ggplot(aes(valid, n, fill = pheno_group)) + 
  geom_bar(stat='identity', position = 'dodge') + 
  facet_grid(library~gene_group) + 
  panel_border() + 
  xlab('empirical sgRNA') + ylab('percentage of sgRNAs in group')

plot of chunk unnamed-chunk-74


## without reference genes
for_plotting %>% filter(reference_group != 'core-essential') %>%
  count(library, gene_group, valid, pheno_group) %>% 
  complete(library, gene_group, valid, pheno_group, fill = list(n = 0)) %>%
  ggplot(aes(valid, n, fill = pheno_group)) + 
  geom_bar(stat='identity', position = 'dodge') + 
  facet_grid(library~gene_group) + 
  panel_border() + 
  xlab('empirical sgRNA') + ylab('number of sgRNAs in group')

plot of chunk unnamed-chunk-74

Differential essentiality between SCC and Bulk

Are there strong clonality effects where genes are more essential in one clone compared to other clones or the bulk population? We first make a Venn diagram of all essential genes detected in bulk and single cell clone populations.

map(c('combined', 'A', 'B'), function(lib_type){
  ## overlap between clones with bagel
  scc_hits <- bfs %>% filter(library == lib_type) %>% 
    mutate(essential = ifelse(BF > 6, T, F)) %>% 
    filter(essential) %>% split(.$type) %>% 
    map(~ .x$GENE)

  plot(euler(scc_hits), quantities = T, 
       main = 'Cell population-specific essential genes',
       col = c('#67001F', '#4393C3', '#053061'),
       fill = NA)
})

plot of chunk unnamed-chunk-75plot of chunk unnamed-chunk-75plot of chunk unnamed-chunk-75

Overall, the single-cell clone-specific essential genes are fewer than expected. We generate scatter plots to compare SCC11 and SCC12 Bayes Factors to the bulk population for both sub-libarries and the combined library.

## scatter plots
map(c('combined', 'A', 'B'), function(lib_type){
  map(c('SCC11', 'SCC12'), function(clone){
    bfs %>% filter(library == lib_type, type %in% c('Pool', clone)) %>%
      dplyr::select(GENE, BF, type) %>% 
      spread(type, BF) %>% dplyr::rename(scc = !!clone) %>% 
      mutate(scc_spec = ifelse(Pool < 6 & scc > 6, T, F)) %>%
      ggplot(aes(Pool, scc, colour = scc_spec)) + 
      ggrastr::geom_point_rast(size=1) + 
      geom_abline(linetype='dashed') + 
      geom_hline(yintercept = 0, linetype = 'dashed') + 
      geom_vline(xintercept = 0, linetype = 'dashed') + 
      scale_color_manual(values = c('#dddddd', '#db4437')) + 
      theme(legend.position = 'none') + 
      ggtitle(paste(lib_type, clone))
  })
})

plot of chunk unnamed-chunk-76plot of chunk unnamed-chunk-76plot of chunk unnamed-chunk-76plot of chunk unnamed-chunk-76plot of chunk unnamed-chunk-76plot of chunk unnamed-chunk-76

We perform a gene set enrichment analysis to determine whether there are specific pathways or biological processes that are essential only in bulk or single cell cloens.

## a list of all avana library genes
hdcrispr_genes <- unique(bfs$GENE)
## get entrez gene IDs
hdcrispr_genes <- bitr(hdcrispr_genes, 
                      fromType = 'SYMBOL', 
                      toType = 'ENTREZID', 
                      OrgDb = org.Hs.eg.db)

## convert cluster genes to entrez id
go_results <- map(c('combined', 'A', 'B'), function(lib_type){
  df <- bfs %>% filter(library == lib_type) %>% 
    dplyr::select(GENE, BF, type) %>% 
    spread(type, BF)

  ## SCC11-specific genes
  genes_scc11 <- df %>% filter(SCC11 > 6, SCC12 < 6, Pool < 6) %>% pull(GENE)
  genes_scc11 <- hdcrispr_genes %>% 
    filter(SYMBOL %in% genes_scc11) %>% pull(ENTREZID)
  ## SCC12-specific genes
  genes_scc12 <- df %>% filter(SCC11 < 6, SCC12 > 6, Pool < 6) %>% pull(GENE)
  genes_scc12 <- hdcrispr_genes %>% 
    filter(SYMBOL %in% genes_scc12) %>% pull(ENTREZID)
  ## bulk-specific genes
  genes_bulk <- df %>% filter(SCC11 < 6, SCC12 < 6, Pool > 6) %>% pull(GENE)
  genes_bulk <- hdcrispr_genes %>% 
    filter(SYMBOL %in% genes_bulk) %>% pull(ENTREZID)

  ## run go over-representation analysis
  go_enr <- map(list(genes_scc11, genes_scc12, genes_bulk), ~{
    goana(.x, 
          universe = hdcrispr_genes$ENTREZID, 
          FDR = 0.01, species = 'Hs') %>%
      as_tibble(rownames = 'go_id') %>% filter(Ont == 'BP') %>%
      arrange(P.DE) %>% mutate(FDR = p.adjust(P.DE, method = 'BH')) %>%
      filter(FDR < 0.05)
  })
})

Session info

sessionInfo()
#> R version 3.6.2 (2019-12-12)
#> Platform: x86_64-apple-darwin15.6.0 (64-bit)
#> Running under: macOS Catalina 10.15.6
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] forcats_0.5.0               stringr_1.4.0              
#>  [3] dplyr_1.0.2                 purrr_0.3.4                
#>  [5] readr_1.3.1                 tidyr_1.1.2                
#>  [7] tibble_3.0.3                tidyverse_1.3.0            
#>  [9] ggrepel_0.8.2               ggplot2_3.3.2              
#> [11] cowplot_1.1.0               ggrastr_0.2.1              
#> [13] pheatmap_1.0.12             gridExtra_2.3              
#> [15] UpSetR_1.4.0                eulerr_6.1.0               
#> [17] reshape2_1.4.4              SummarizedExperiment_1.16.1
#> [19] DelayedArray_0.12.3         BiocParallel_1.20.1        
#> [21] matrixStats_0.57.0          GenomicRanges_1.38.0       
#> [23] GenomeInfoDb_1.22.1         gscreend_1.0.0             
#> [25] Biostrings_2.54.0           XVector_0.26.0             
#> [27] mixtools_1.2.0              preprocessCore_1.48.0      
#> [29] org.Hs.eg.db_3.10.0         AnnotationDbi_1.48.0       
#> [31] IRanges_2.20.2              S4Vectors_0.24.4           
#> [33] Biobase_2.46.0              BiocGenerics_0.32.0        
#> [35] limma_3.42.2                clusterProfiler_3.14.3     
#> [37] ROCR_1.0-11                 HDCRISPR2019_0.1.0         
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.3.1           backports_1.1.10       fastmatch_1.1-0       
#>   [4] plyr_1.8.6             igraph_1.2.5           polylabelr_0.2.0      
#>   [7] splines_3.6.2          urltools_1.7.3         digest_0.6.25         
#>  [10] GOSemSim_2.12.1        viridis_0.5.1          GO.db_3.10.0          
#>  [13] fansi_0.4.1            magrittr_1.5           memoise_1.1.0         
#>  [16] graphlayouts_0.7.0     modelr_0.1.8           enrichplot_1.6.1      
#>  [19] prettyunits_1.1.1      colorspace_1.4-1       rvest_0.3.6           
#>  [22] blob_1.2.1             haven_2.3.1            xfun_0.17             
#>  [25] hexbin_1.28.1          crayon_1.3.4           RCurl_1.98-1.2        
#>  [28] jsonlite_1.7.1         survival_3.2-3         glue_1.4.2            
#>  [31] polyclip_1.10-0        gtable_0.3.0           zlibbioc_1.32.0       
#>  [34] kernlab_0.9-29         fGarch_3042.83.2       scales_1.1.1          
#>  [37] DOSE_3.12.0            DBI_1.1.0              Rcpp_1.0.5            
#>  [40] viridisLite_0.3.0      progress_1.2.2         gridGraphics_0.5-0    
#>  [43] bit_4.0.4              europepmc_0.4          timeSeries_3062.100   
#>  [46] httr_1.4.2             fgsea_1.12.0           gplots_3.1.0          
#>  [49] RColorBrewer_1.1-2     ellipsis_0.3.1         spatial_7.3-12        
#>  [52] pkgconfig_2.0.3        farver_2.0.3           dbplyr_1.4.4          
#>  [55] utf8_1.1.4             labeling_0.3           ggplotify_0.0.5       
#>  [58] tidyselect_1.1.0       rlang_0.4.7            munsell_0.5.0         
#>  [61] cellranger_1.1.0       tools_3.6.2            cli_2.0.2             
#>  [64] generics_0.0.2         RSQLite_2.2.0          broom_0.7.0           
#>  [67] ggridges_0.5.2         evaluate_0.14          fs_1.5.0              
#>  [70] knitr_1.30             bit64_4.0.5            tidygraph_1.2.0       
#>  [73] caTools_1.18.0         ggraph_2.0.3           pracma_2.2.9          
#>  [76] DO.db_2.9              xml2_1.3.2             rstudioapi_0.11       
#>  [79] compiler_3.6.2         beeswarm_0.2.3         reprex_0.3.0          
#>  [82] tweenr_1.0.1           stringi_1.5.3          highr_0.8             
#>  [85] lattice_0.20-41        fBasics_3042.89.1      Matrix_1.2-18         
#>  [88] nloptr_1.2.2.2         vctrs_0.3.4            pillar_1.4.6          
#>  [91] lifecycle_0.2.0        BiocManager_1.30.10    triebeard_0.3.0       
#>  [94] data.table_1.12.8      bitops_1.0-6           qvalue_2.18.0         
#>  [97] R6_2.4.1               KernSmooth_2.23-17     vipor_0.4.5           
#> [100] gtools_3.8.2           MASS_7.3-53            assertthat_0.2.1      
#> [103] withr_2.3.0            GenomeInfoDbData_1.2.2 hms_0.5.3             
#> [106] grid_3.6.2             timeDate_3043.102      rvcheck_0.1.8         
#> [109] segmented_1.2-0        Cairo_1.5-12.2         ggforce_0.3.2         
#> [112] lubridate_1.7.9        ggbeeswarm_0.6.0